-- Listing metadata: PL/pgSQL, 62 lines, 2.2 KiB
-- normalize_url(in_url): return a canonical form of a URL so that equivalent
-- URLs compare equal.
--
-- Normalisation steps:
--   * trim surrounding whitespace and drop the fragment (#...);
--   * assume "http://" when the input carries no scheme;
--   * lower-case scheme and authority, drop a leading "www." (http/https only)
--     and the default port (:80 for http, :443 for https);
--   * collapse repeated "/" in the path and drop the trailing "/" unless the
--     path is the root;
--   * remove common tracking parameters (utm_*, gclid, fbclid, mc_cid, mc_eid,
--     ref, ref_src, yclid, igshid) from the query string and tidy up leftover
--     "&" separators.
--
-- Parameters:
--   in_url  the URL to normalise; may be NULL.
-- Returns:
--   the normalised URL, or NULL when the input is NULL, blank, or consists
--   only of a fragment.
--
-- Notes:
--   * The path and the query string are processed separately, so "//" inside
--     a query value (e.g. ?to=https://other.example) is left untouched.
--   * The whole authority is lower-cased, including any userinfo part.
--   * Declared IMMUTABLE: the result depends only on the argument.
CREATE OR REPLACE FUNCTION normalize_url(in_url text)
RETURNS text
LANGUAGE plpgsql
IMMUTABLE
AS $$
DECLARE
    u           text := trim(in_url);
    raw_prefix  text;   -- scheme://authority exactly as written in the input
    scheme_host text;   -- normalised scheme://authority
    rest        text;   -- everything after the authority (path + query)
    path_part   text;
    query_part  text;   -- query string without the leading '?'
    qpos        integer;
BEGIN
    IF u IS NULL OR u = '' THEN
        RETURN NULL;
    END IF;

    -- Drop the fragment; nothing left means there is no URL to normalise.
    u := regexp_replace(u, '#.*$', '');
    IF u = '' THEN
        RETURN NULL;
    END IF;

    -- No scheme: assume http. The check is case-insensitive so that
    -- "HTTP://..." is recognised as already having a scheme.
    IF u !~* '^[a-z][a-z0-9+.-]*://' THEN
        u := 'http://' || u;
    END IF;

    -- Split scheme+authority from path+query. The authority ends at the first
    -- '/' or '?', so "http://example.com?x=1" keeps its query out of the host.
    -- e.g. https://example.com:443/foo?bar -> https://example.com:443 | /foo?bar
    raw_prefix := substring(u FROM '^[A-Za-z][A-Za-z0-9+.-]*://[^/?]*');
    rest := substr(u, length(raw_prefix) + 1);

    -- Split path from query at the first '?'.
    qpos := position('?' IN rest);
    IF qpos > 0 THEN
        path_part  := substr(rest, 1, qpos - 1);
        query_part := substr(rest, qpos + 1);
    ELSE
        path_part  := rest;
        query_part := '';
    END IF;

    -- Scheme and host: lower-case, strip "www.", strip the default port.
    scheme_host := lower(raw_prefix);
    scheme_host := regexp_replace(scheme_host, '^(https?://)www\.', '\1');
    scheme_host := regexp_replace(scheme_host, '^http://([^/:]+):80$', 'http://\1');
    scheme_host := regexp_replace(scheme_host, '^https://([^/:]+):443$', 'https://\1');

    -- Path: collapse runs of '/', default to the root, drop the trailing '/'
    -- unless the path is the root itself.
    path_part := regexp_replace(path_part, '/{2,}', '/', 'g');
    IF path_part = '' THEN
        path_part := '/';
    ELSIF path_part <> '/' THEN
        path_part := rtrim(path_part, '/');
    END IF;

    -- Query: remove tracking parameters (name=value pairs only), keeping the
    -- separator that preceded each one, then tidy the separators.
    query_part := regexp_replace(
        query_part,
        '(^|&)(utm_[^=&]+|gclid|fbclid|mc_cid|mc_eid|ref|ref_src|yclid|igshid)=[^&]*',
        '\1',
        'gi'
    );
    query_part := regexp_replace(query_part, '&{2,}', '&', 'g');
    query_part := regexp_replace(query_part, '^&+|&+$', '', 'g');

    RETURN scheme_host
        || path_part
        || CASE WHEN query_part <> '' THEN '?' || query_part ELSE '' END;
END;
$$;