Mejoras: NER, embeddings, dashboard, docker-compose y limpieza
This commit is contained in:
parent
6c5aff9936
commit
d508dc2058
19 changed files with 2218 additions and 1185 deletions
62
migrations/001_utils_normalize_url.sql
Normal file
62
migrations/001_utils_normalize_url.sql
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
-- normalize_url(in_url text) -> text
--
-- Returns a canonical form of a URL so that equivalent links compare equal:
--   * trims surrounding whitespace and drops the fragment (#...);
--   * assumes "http://" when the input has no scheme;
--   * lowercases scheme and host, removes a leading "www." and the default
--     port (:80 for http, :443 for https);
--   * collapses repeated "/" in the path and removes the trailing "/" unless
--     the path is the root;
--   * removes common tracking parameters (utm_*, gclid, fbclid, mc_cid,
--     mc_eid, ref, ref_src, yclid, igshid) from the query string.
--
-- The path and the query string are processed separately, so slashes inside
-- query values (e.g. ?next=https://other.org) are left untouched.
--
-- Returns NULL when the input is NULL, blank, or only a fragment.
-- Declared IMMUTABLE (the result depends only on the argument) so it can be
-- used in expression indexes.
CREATE OR REPLACE FUNCTION normalize_url(in_url text)
RETURNS text
LANGUAGE plpgsql
IMMUTABLE
AS $$
DECLARE
  u           text := trim(in_url);
  scheme_host text;     -- e.g. https://example.com:443
  rest        text;     -- everything after scheme_host: path + optional ?query
  path_part   text;
  query_part  text;     -- query string without the leading '?'
  q_pos       integer;  -- position of the first '?' in rest (0 if none)
BEGIN
  IF u IS NULL OR u = '' THEN
    RETURN NULL;
  END IF;

  -- Drop the fragment; it never identifies a different resource.
  u := regexp_replace(u, '#.*$', '');
  IF u = '' THEN
    RETURN NULL;
  END IF;

  -- Split scheme+host from the remainder.
  -- e.g. https://example.com:443/foo?bar -> scheme_host=https://example.com:443 ; rest=/foo?bar
  -- The scheme is matched in any letter case, and the host stops at '/' or
  -- '?' so a query that directly follows the host is not mistaken for it.
  scheme_host := substring(u FROM '^[A-Za-z][A-Za-z0-9+.-]*://[^/?]*');
  IF scheme_host IS NULL THEN
    -- No scheme present: assume http.
    u := 'http://' || u;
    scheme_host := substring(u FROM '^[A-Za-z][A-Za-z0-9+.-]*://[^/?]*');
  END IF;
  rest := substr(u, length(scheme_host) + 1);

  -- Split the remainder into path and query at the first '?'.
  q_pos := strpos(rest, '?');
  IF q_pos > 0 THEN
    path_part  := left(rest, q_pos - 1);
    query_part := substr(rest, q_pos + 1);
  ELSE
    path_part  := rest;
    query_part := '';
  END IF;

  -- Scheme and host: lowercase, strip "www." and the default port.
  scheme_host := lower(scheme_host);
  scheme_host := regexp_replace(scheme_host, '^(https?://)www\.', '\1');
  scheme_host := regexp_replace(scheme_host, '^http://([^/:]+):80$', 'http://\1');
  scheme_host := regexp_replace(scheme_host, '^https://([^/:]+):443$', 'https://\1');

  -- Path: collapse runs of '/', default to the root, and strip the trailing
  -- '/' unless the path is the root itself.
  path_part := regexp_replace(path_part, '/{2,}', '/', 'g');
  IF path_part = '' THEN
    path_part := '/';
  END IF;
  IF path_part <> '/' THEN
    path_part := regexp_replace(path_part, '/+$', '');
  END IF;

  -- Query: prefix with '&' so every parameter, including the first, has the
  -- shape "&name=value"; each tracking parameter is then removed together
  -- with its own separator, wherever it appears.
  query_part := regexp_replace(
    '&' || query_part,
    '&(utm_[^=&]+|gclid|fbclid|mc_cid|mc_eid|ref|ref_src|yclid|igshid)=[^&]*',
    '',
    'gi'
  );
  -- Tidy leftover separators: collapse "&&" and trim '&' at both ends.
  query_part := regexp_replace(query_part, '&{2,}', '&', 'g');
  query_part := regexp_replace(query_part, '^&+|&+$', '', 'g');

  IF query_part <> '' THEN
    RETURN scheme_host || path_part || '?' || query_part;
  END IF;
  RETURN scheme_host || path_part;
END;
$$;
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue