Go integration and Wikipedia

This commit is contained in:
jlimolina 2026-03-28 18:30:07 +01:00
parent 47a252e339
commit ee90335b92
7828 changed files with 1307913 additions and 20807 deletions


@@ -0,0 +1,468 @@
package main
import (
"context"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/mmcdole/gofeed"
"github.com/rss2/backend/internal/workers"
)
var (
logger *log.Logger
dbPool *pgxpool.Pool
sleepSec = 900 // 15 minutes
batchSize = 10
)
type URLSource struct {
ID int64
Nombre string
URL string
CategoriaID *int64
PaisID *int64
Idioma *string
}
func init() {
logger = log.New(os.Stdout, "[DISCOVERY] ", log.LstdFlags)
}
func loadConfig() {
sleepSec = getEnvInt("DISCOVERY_INTERVAL", 900)
batchSize = getEnvInt("DISCOVERY_BATCH", 10)
}
func getEnvInt(key string, defaultValue int) int {
if value := os.Getenv(key); value != "" {
if intVal, err := strconv.Atoi(value); err == nil {
return intVal
}
}
return defaultValue
}
func getPendingURLs(ctx context.Context) ([]URLSource, error) {
rows, err := dbPool.Query(ctx, `
SELECT id, nombre, url, categoria_id, pais_id, idioma
FROM fuentes_url
WHERE active = TRUE
ORDER BY
CASE
WHEN last_check IS NULL THEN 1
WHEN last_status = 'error' THEN 2
WHEN last_status = 'no_feeds' THEN 3
ELSE 4
END,
last_check ASC NULLS FIRST
LIMIT $1
`, batchSize)
if err != nil {
return nil, err
}
defer rows.Close()
var sources []URLSource
for rows.Next() {
var s URLSource
if err := rows.Scan(&s.ID, &s.Nombre, &s.URL, &s.CategoriaID, &s.PaisID, &s.Idioma); err != nil {
continue
}
sources = append(sources, s)
}
return sources, nil
}
func updateURLStatus(ctx context.Context, urlID int64, status, message string, httpCode int) error {
_, err := dbPool.Exec(ctx, `
UPDATE fuentes_url
SET last_check = NOW(),
last_status = $1,
status_message = $2,
last_http_code = $3
WHERE id = $4
`, status, message, httpCode, urlID)
return err
}
func discoverFeeds(pageURL string) ([]string, error) {
client := &http.Client{
Timeout: 15 * time.Second,
}
req, err := http.NewRequest("GET", pageURL, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; RSS2Bot/1.0)")
req.Header.Set("Accept", "application/rss+xml, application/atom+xml, application/xml, text/xml, text/html")
resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
// Try to parse as feed first
parser := gofeed.NewParser()
feed, err := parser.Parse(resp.Body)
if err == nil && feed != nil && len(feed.Items) > 0 {
// It's a valid feed
return []string{pageURL}, nil
}
// If not a feed, try to find feeds in HTML
return findFeedLinksInHTML(pageURL)
}
func findFeedLinksInHTML(baseURL string) ([]string, error) {
// Simple feed link finder - returns empty for now
// In production, use goquery to parse HTML and find RSS/Atom links
return []string{}, nil
}
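// Illustrative only: a minimal sketch of what findFeedLinksInHTML could do with
// goquery (already a dependency of the scraper worker in this commit). It loads
// the page, collects <link rel="alternate"> entries whose type looks like RSS or
// Atom, and resolves relative hrefs against the page URL. The helper name is
// hypothetical and it would need the extra imports "net/url" and
// "github.com/PuerkitoBio/goquery".
func findFeedLinksInHTMLSketch(baseURL string) ([]string, error) {
client := &http.Client{Timeout: 15 * time.Second}
resp, err := client.Get(baseURL)
if err != nil {
return nil, err
}
defer resp.Body.Close()
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return nil, err
}
base, err := url.Parse(baseURL)
if err != nil {
return nil, err
}
var feeds []string
doc.Find("link[rel='alternate']").Each(func(_ int, s *goquery.Selection) {
typ, _ := s.Attr("type")
if !strings.Contains(typ, "rss") && !strings.Contains(typ, "atom") {
return
}
if href, ok := s.Attr("href"); ok && href != "" {
if ref, err := url.Parse(href); err == nil {
feeds = append(feeds, base.ResolveReference(ref).String())
}
}
})
return feeds, nil
}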
func parseFeed(feedURL string) (*gofeed.Feed, error) {
client := &http.Client{
Timeout: 30 * time.Second,
}
req, err := http.NewRequest("GET", feedURL, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; RSS2Bot/1.0)")
req.Header.Set("Accept", "application/rss+xml, application/atom+xml, application/xml, text/xml")
resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
parser := gofeed.NewParser()
return parser.Parse(resp.Body)
}
func getFeedMetadata(feedURL string) (title, description, language string, entryCount int, err error) {
feed, err := parseFeed(feedURL)
if err != nil {
return "", "", "", 0, err
}
title = feed.Title
if title == "" {
title = "Feed sin título"
}
description = feed.Description
if len(description) > 500 {
description = description[:500]
}
language = feed.Language
entryCount = len(feed.Items)
return title, description, language, entryCount, nil
}
func analyzeFeed(title, url, description string) (country, category string) {
// Simple heuristics - in production use ML or API
lowerTitle := strings.ToLower(title)
lowerDesc := strings.ToLower(description)
combined := lowerTitle + " " + lowerDesc
// Detect country
countries := map[string][]string{
"España": {"españa", "español", "madrid", "barcelona"},
"Argentina": {"argentino", "buenos aires"},
"México": {"méxico", "mexicano", "cdmx", "ciudad de méxico"},
"Colombia": {"colombiano", "bogotá"},
"Chile": {"chileno", "santiago"},
"Perú": {"peruano", "lima"},
"EE.UU.": {"estados unidos", "washington", "trump", "biden"},
"Reino Unido": {"reino unido", "londres", "uk"},
"Francia": {"francia", "parís"},
"Alemania": {"alemania", "berlín"},
}
for country, keywords := range countries {
for _, kw := range keywords {
if strings.Contains(combined, kw) {
return country, ""
}
}
}
return "", ""
}
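// Illustrative only: analyzeFeed above always returns an empty category. A
// matching keyword heuristic for the suggested category could look like this
// sketch; the helper name, category names and keywords are assumptions and
// would have to line up with the rows actually present in the categorias table.
func suggestCategorySketch(combined string) string {
categories := map[string][]string{
"Deportes": {"fútbol", "liga", "deporte", "tenis"},
"Economía": {"economía", "mercados", "bolsa", "inflación"},
"Tecnología": {"tecnología", "software", "inteligencia artificial"},
"Política": {"gobierno", "elecciones", "parlamento"},
}
for category, keywords := range categories {
for _, kw := range keywords {
if strings.Contains(combined, kw) {
return category
}
}
}
return ""
}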
func getCountryIDByName(ctx context.Context, countryName string) (*int64, error) {
var id int64
err := dbPool.QueryRow(ctx, "SELECT id FROM paises WHERE LOWER(nombre) = LOWER($1)", countryName).Scan(&id)
if err != nil {
return nil, err
}
return &id, nil
}
func getCategoryIDByName(ctx context.Context, categoryName string) (*int64, error) {
var id int64
err := dbPool.QueryRow(ctx, "SELECT id FROM categorias WHERE LOWER(nombre) = LOWER($1)", categoryName).Scan(&id)
if err != nil {
return nil, err
}
return &id, nil
}
func createPendingFeed(ctx context.Context, fuenteURLID int64, feedURL string, metadata map[string]interface{}) error {
feedTitle := metadata["title"].(string)
if feedTitle == "" {
feedTitle = "Feed sin título"
}
description := ""
if d, ok := metadata["description"].(string); ok {
description = d
}
language := ""
if l, ok := metadata["language"].(string); ok {
language = l
}
entryCount := 0
if c, ok := metadata["entry_count"].(int); ok {
entryCount = c
}
detectedCountry := ""
if dc, ok := metadata["detected_country"].(string); ok {
detectedCountry = dc
}
var detectedCountryID *int64
if detectedCountry != "" {
if cid, err := getCountryIDByName(ctx, detectedCountry); err == nil {
detectedCountryID = cid
}
}
suggestedCategory := ""
if sc, ok := metadata["suggested_category"].(string); ok {
suggestedCategory = sc
}
var suggestedCategoryID *int64
if suggestedCategory != "" {
if caid, err := getCategoryIDByName(ctx, suggestedCategory); err == nil {
suggestedCategoryID = caid
}
}
_, err := dbPool.Exec(ctx, `
INSERT INTO feeds_pending (
fuente_url_id, feed_url, feed_title, feed_description,
feed_language, feed_type, entry_count,
detected_country_id, suggested_categoria_id,
discovered_at
)
VALUES ($1, $2, $3, $4, $5, 'rss', $6, $7, $8, NOW())
ON CONFLICT (feed_url) DO UPDATE
SET feed_title = EXCLUDED.feed_title,
discovered_at = NOW()
`, fuenteURLID, feedURL, feedTitle, description, language, entryCount, detectedCountryID, suggestedCategoryID)
return err
}
func createFeedDirectly(ctx context.Context, feedURL string, fuenteURLID *int64, categoriaID, paisID *int64, idioma *string) (bool, error) {
title, description, language, _, err := getFeedMetadata(feedURL)
if err != nil {
return false, err
}
if language == "" && idioma != nil {
language = *idioma
}
var feedID int64
err = dbPool.QueryRow(ctx, `
INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, fuente_url_id, activo)
VALUES ($1, $2, $3, $4, $5, $6, $7, TRUE)
ON CONFLICT (url) DO NOTHING
RETURNING id
`, title, description, feedURL, categoriaID, paisID, language, fuenteURLID).Scan(&feedID)
if errors.Is(err, pgx.ErrNoRows) {
// ON CONFLICT DO NOTHING returns no row when the feed URL already exists
return false, nil
}
if err != nil {
return false, err
}
return feedID > 0, nil
}
func processURLSource(ctx context.Context, source URLSource) {
logger.Printf("Processing: %s (%s)", source.Nombre, source.URL)
// Try to find feeds on this URL
feeds, err := discoverFeeds(source.URL)
if err != nil {
logger.Printf("Error discovering feeds: %v", err)
msg := err.Error()
if len(msg) > 200 {
msg = msg[:200] // don't slice past the end of short error messages
}
updateURLStatus(ctx, source.ID, "error", msg, 0)
return
}
if len(feeds) == 0 {
logger.Printf("No feeds found for: %s", source.URL)
updateURLStatus(ctx, source.ID, "no_feeds", "No feeds found", 200)
return
}
logger.Printf("Found %d feeds for %s", len(feeds), source.URL)
maxFeeds := getEnvInt("MAX_FEEDS_PER_URL", 5)
if len(feeds) > maxFeeds {
feeds = feeds[:maxFeeds]
}
autoApprove := source.CategoriaID != nil && source.PaisID != nil
created := 0
pending := 0
existing := 0
errors := 0
for _, feedURL := range feeds {
// Get feed metadata
title, description, language, entryCount, err := getFeedMetadata(feedURL)
if err != nil {
logger.Printf("Error parsing feed %s: %v", feedURL, err)
errors++
continue
}
// Analyze for country/category
detectedCountry, suggestedCategory := analyzeFeed(title, feedURL, description)
metadata := map[string]interface{}{
"title": title,
"description": description,
"language": language,
"entry_count": entryCount,
"detected_country": detectedCountry,
"suggested_category": suggestedCategory,
}
if !autoApprove {
// Create pending feed for review
if err := createPendingFeed(ctx, source.ID, feedURL, metadata); err != nil {
logger.Printf("Error creating pending feed: %v", err)
errors++
} else {
pending++
}
} else {
// Create feed directly
createdFeed, err := createFeedDirectly(ctx, feedURL, &source.ID, source.CategoriaID, source.PaisID, source.Idioma)
if err != nil {
logger.Printf("Error creating feed: %v", err)
errors++
} else if createdFeed {
created++
} else {
existing++
}
}
time.Sleep(1 * time.Second) // Rate limiting
}
// Update status
var status string
var message string
if created > 0 || pending > 0 {
status = "success"
parts := []string{}
if created > 0 {
parts = append(parts, fmt.Sprintf("%d created", created))
}
if pending > 0 {
parts = append(parts, fmt.Sprintf("%d pending", pending))
}
message = strings.Join(parts, ", ")
} else if existing > 0 {
status = "existing"
message = fmt.Sprintf("%d already existed", existing)
} else {
status = "error"
message = fmt.Sprintf("%d errors", errors)
}
updateURLStatus(ctx, source.ID, status, message, 200)
logger.Printf("Processed %s: created=%d, pending=%d, existing=%d, errors=%d",
source.URL, created, pending, existing, errors)
}
func main() {
loadConfig()
logger.Println("Starting RSS Discovery Worker")
cfg := workers.LoadDBConfig()
if err := workers.Connect(cfg); err != nil {
logger.Fatalf("Failed to connect to database: %v", err)
}
dbPool = workers.GetPool()
defer workers.Close()
logger.Println("Connected to PostgreSQL")
ctx := context.Background()
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-sigChan
logger.Println("Shutting down...")
os.Exit(0)
}()
logger.Printf("Config: interval=%ds, batch=%d", sleepSec, batchSize)
ticker := time.NewTicker(time.Duration(sleepSec) * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
sources, err := getPendingURLs(ctx)
if err != nil {
logger.Printf("Error fetching URLs: %v", err)
continue
}
if len(sources) == 0 {
logger.Println("No pending URLs to process")
continue
}
logger.Printf("Processing %d sources", len(sources))
for _, source := range sources {
processURLSource(ctx, source)
time.Sleep(2 * time.Second)
}
}
}
}

backend/cmd/qdrant/main.go Normal file

@@ -0,0 +1,391 @@
package main
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"os/signal"
"strconv"
"syscall"
"time"
"github.com/google/uuid"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/rss2/backend/internal/workers"
)
var (
logger *log.Logger
dbPool *pgxpool.Pool
qdrantURL string
ollamaURL string
collection = "news_vectors"
sleepSec = 30
batchSize = 100
)
func init() {
logger = log.New(os.Stdout, "[QDRANT] ", log.LstdFlags)
}
func loadConfig() {
sleepSec = getEnvInt("QDRANT_SLEEP", 30)
batchSize = getEnvInt("QDRANT_BATCH", 100)
qdrantHost := getEnv("QDRANT_HOST", "localhost")
qdrantPort := getEnvInt("QDRANT_PORT", 6333)
qdrantURL = fmt.Sprintf("http://%s:%d", qdrantHost, qdrantPort)
ollamaURL = getEnv("OLLAMA_URL", "http://ollama:11434")
collection = getEnv("QDRANT_COLLECTION", "news_vectors")
}
func getEnv(key, defaultValue string) string {
if value := os.Getenv(key); value != "" {
return value
}
return defaultValue
}
func getEnvInt(key string, defaultValue int) int {
if value := os.Getenv(key); value != "" {
if intVal, err := strconv.Atoi(value); err == nil {
return intVal
}
}
return defaultValue
}
type Translation struct {
ID int64
NoticiaID int64
Lang string
Titulo string
Resumen string
URL string
Fecha *time.Time
FuenteNombre string
CategoriaID *int64
PaisID *int64
}
func getPendingTranslations(ctx context.Context) ([]Translation, error) {
rows, err := dbPool.Query(ctx, `
SELECT
t.id as traduccion_id,
t.noticia_id,
TRIM(t.lang_to) as lang,
t.titulo_trad as titulo,
t.resumen_trad as resumen,
n.url,
n.fecha,
n.fuente_nombre,
n.categoria_id,
n.pais_id
FROM traducciones t
INNER JOIN noticias n ON t.noticia_id = n.id
WHERE t.vectorized = FALSE
AND t.status = 'done'
ORDER BY t.created_at ASC
LIMIT $1
`, batchSize)
if err != nil {
return nil, err
}
defer rows.Close()
var translations []Translation
for rows.Next() {
var t Translation
if err := rows.Scan(
&t.ID, &t.NoticiaID, &t.Lang, &t.Titulo, &t.Resumen,
&t.URL, &t.Fecha, &t.FuenteNombre, &t.CategoriaID, &t.PaisID,
); err != nil {
continue
}
translations = append(translations, t)
}
return translations, nil
}
type EmbeddingRequest struct {
Model string `json:"model"`
// Ollama's /api/embeddings endpoint expects the text under "prompt"
Prompt string `json:"prompt"`
}
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}
func generateEmbedding(text string) ([]float64, error) {
reqBody := EmbeddingRequest{
Model: "mxbai-embed-large",
Prompt: text,
}
body, err := json.Marshal(reqBody)
if err != nil {
return nil, err
}
client := &http.Client{Timeout: 60 * time.Second}
resp, err := client.Post(ollamaURL+"/api/embeddings", "application/json", bytes.NewReader(body))
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("Ollama returned status %d", resp.StatusCode)
}
var result EmbeddingResponse
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
return nil, err
}
return result.Embedding, nil
}
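// Illustrative only: Ollama may briefly refuse requests while a model is being
// loaded, so a thin retry wrapper around generateEmbedding can smooth that over.
// This is a sketch with a simple linear backoff, not part of the original commit.
func generateEmbeddingWithRetry(text string, attempts int) ([]float64, error) {
var lastErr error
for i := 0; i < attempts; i++ {
emb, err := generateEmbedding(text)
if err == nil {
return emb, nil
}
lastErr = err
time.Sleep(time.Duration(i+1) * 2 * time.Second)
}
return nil, lastErr
}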
type QdrantPoint struct {
ID interface{} `json:"id"`
Vector []float64 `json:"vector"`
Payload map[string]interface{} `json:"payload"`
}
type QdrantUpsertRequest struct {
Points []QdrantPoint `json:"points"`
}
func ensureCollection() error {
req, err := http.NewRequest("GET", qdrantURL+"/collections/"+collection, nil)
if err != nil {
return err
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode == 200 {
logger.Printf("Collection %s already exists", collection)
return nil
}
// Get embedding dimension
emb, err := generateEmbedding("test")
if err != nil {
return fmt.Errorf("failed to get embedding dimension: %w", err)
}
dimension := len(emb)
// Create collection (Qdrant expects PUT /collections/{name} with the vector params in the body)
createReq := map[string]interface{}{
"vectors": map[string]interface{}{
"size": dimension,
"distance": "Cosine",
},
}
body, _ := json.Marshal(createReq)
putReq, err := http.NewRequest("PUT", qdrantURL+"/collections/"+collection, bytes.NewReader(body))
if err != nil {
return err
}
putReq.Header.Set("Content-Type", "application/json")
resp2, err := http.DefaultClient.Do(putReq)
if err != nil {
return err
}
defer resp2.Body.Close()
logger.Printf("Created collection %s with dimension %d", collection, dimension)
return nil
}
func uploadToQdrant(translations []Translation, embeddings [][]float64) ([]string, error) {
// Return the generated point IDs (aligned with translations) so callers can persist the same IDs
pointIDs := make([]string, len(translations))
points := make([]QdrantPoint, 0, len(translations))
for i, t := range translations {
if embeddings[i] == nil {
continue
}
pointID := uuid.New().String()
pointIDs[i] = pointID
payload := map[string]interface{}{
"news_id": t.NoticiaID,
"traduccion_id": t.ID,
"titulo": t.Titulo,
"resumen": t.Resumen,
"url": t.URL,
"fuente_nombre": t.FuenteNombre,
"lang": t.Lang,
}
if t.Fecha != nil {
payload["fecha"] = t.Fecha.Format(time.RFC3339)
}
if t.CategoriaID != nil {
payload["categoria_id"] = *t.CategoriaID
}
if t.PaisID != nil {
payload["pais_id"] = *t.PaisID
}
points = append(points, QdrantPoint{
ID: pointID,
Vector: embeddings[i],
Payload: payload,
})
}
if len(points) == 0 {
return pointIDs, nil
}
reqBody := QdrantUpsertRequest{Points: points}
body, err := json.Marshal(reqBody)
if err != nil {
return nil, err
}
// Qdrant's points upsert endpoint expects PUT
url := fmt.Sprintf("%s/collections/%s/points", qdrantURL, collection)
upReq, err := http.NewRequest("PUT", url, bytes.NewReader(body))
if err != nil {
return nil, err
}
upReq.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(upReq)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != 200 && resp.StatusCode != 202 {
respBody, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("Qdrant returned status %d: %s", resp.StatusCode, string(respBody))
}
return pointIDs, nil
}
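// Illustrative only: a sketch of how the uploaded vectors would be queried back
// for semantic search. It assumes Qdrant's points search endpoint
// (POST /collections/{name}/points/search) with a vector/limit/with_payload body
// and simply returns the raw JSON response; the function is not part of this commit.
func searchSimilarSketch(vector []float64, limit int) ([]byte, error) {
reqBody := map[string]interface{}{
"vector": vector,
"limit": limit,
"with_payload": true,
}
body, err := json.Marshal(reqBody)
if err != nil {
return nil, err
}
searchURL := fmt.Sprintf("%s/collections/%s/points/search", qdrantURL, collection)
resp, err := http.Post(searchURL, "application/json", bytes.NewReader(body))
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("Qdrant search returned status %d", resp.StatusCode)
}
return io.ReadAll(resp.Body)
}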
func updateTranslationStatus(ctx context.Context, translations []Translation, pointIDs []string) error {
for i, t := range translations {
if i >= len(pointIDs) || pointIDs[i] == "" {
continue
}
_, err := dbPool.Exec(ctx, `
UPDATE traducciones
SET
vectorized = TRUE,
vectorization_date = NOW(),
qdrant_point_id = $1
WHERE id = $2
`, pointIDs[i], t.ID)
if err != nil {
logger.Printf("Error updating translation %d: %v", t.ID, err)
}
}
return nil
}
func getStats(ctx context.Context) (total, vectorized, pending int, err error) {
err = dbPool.QueryRow(ctx, `
SELECT
COUNT(*) as total,
COUNT(*) FILTER (WHERE vectorized = TRUE) as vectorized,
COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') as pending
FROM traducciones
WHERE lang_to = 'es'
`).Scan(&total, &vectorized, &pending)
return total, vectorized, pending, err
}
func main() {
loadConfig()
logger.Println("Starting Qdrant Vectorization Worker")
cfg := workers.LoadDBConfig()
if err := workers.Connect(cfg); err != nil {
logger.Fatalf("Failed to connect to database: %v", err)
}
dbPool = workers.GetPool()
defer workers.Close()
logger.Println("Connected to PostgreSQL")
ctx := context.Background()
if err := ensureCollection(); err != nil {
logger.Printf("Warning: Could not ensure collection: %v", err)
}
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-sigChan
logger.Println("Shutting down...")
os.Exit(0)
}()
logger.Printf("Config: qdrant=%s, ollama=%s, collection=%s, sleep=%ds, batch=%d",
qdrantURL, ollamaURL, collection, sleepSec, batchSize)
totalProcessed := 0
for {
select {
case <-time.After(time.Duration(sleepSec) * time.Second):
translations, err := getPendingTranslations(ctx)
if err != nil {
logger.Printf("Error fetching pending translations: %v", err)
continue
}
if len(translations) == 0 {
logger.Println("No pending translations to process")
continue
}
logger.Printf("Processing %d translations...", len(translations))
// Generate embeddings
embeddings := make([][]float64, len(translations))
for i, t := range translations {
text := fmt.Sprintf("%s %s", t.Titulo, t.Resumen)
emb, err := generateEmbedding(text)
if err != nil {
logger.Printf("Error generating embedding for %d: %v", t.ID, err)
continue
}
embeddings[i] = emb
}
// Upload to Qdrant and keep the point IDs that were actually stored there
pointIDs, err := uploadToQdrant(translations, embeddings)
if err != nil {
logger.Printf("Error uploading to Qdrant: %v", err)
continue
}
// Update DB status with the same point IDs so qdrant_point_id matches the Qdrant points
if err := updateTranslationStatus(ctx, translations, pointIDs); err != nil {
logger.Printf("Error updating status: %v", err)
}
totalProcessed += len(translations)
logger.Printf("Processed %d translations (total: %d)", len(translations), totalProcessed)
total, vectorized, pending, err := getStats(ctx)
if err == nil {
logger.Printf("Stats: total=%d, vectorized=%d, pending=%d", total, vectorized, pending)
}
}
}
}

backend/cmd/related/main.go Normal file

@@ -0,0 +1,384 @@
package main
import (
"context"
"log"
"os"
"os/signal"
"strconv"
"syscall"
"time"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/rss2/backend/internal/workers"
)
var (
logger *log.Logger
dbPool *pgxpool.Pool
sleepSec = 10
topK = 10
batchSz = 200
minScore = 0.0
)
func init() {
logger = log.New(os.Stdout, "[RELATED] ", log.LstdFlags)
}
func loadConfig() {
sleepSec = getEnvInt("RELATED_SLEEP", 10)
topK = getEnvInt("RELATED_TOPK", 10)
batchSz = getEnvInt("RELATED_BATCH", 200)
minScore = getEnvFloat("RELATED_MIN_SCORE", 0.0)
}
func getEnvInt(key string, defaultValue int) int {
if value := os.Getenv(key); value != "" {
if intVal, err := strconv.Atoi(value); err == nil {
return intVal
}
}
return defaultValue
}
func getEnvFloat(key string, defaultValue float64) float64 {
if value := os.Getenv(key); value != "" {
if floatVal, err := strconv.ParseFloat(value, 64); err == nil {
return floatVal
}
}
return defaultValue
}
type Translation struct {
ID int64
Titulo string
Resumen string
Embedding []float64
}
func ensureSchema(ctx context.Context) error {
_, err := dbPool.Exec(ctx, `
CREATE TABLE IF NOT EXISTS related_noticias (
traduccion_id INTEGER REFERENCES traducciones(id) ON DELETE CASCADE,
related_traduccion_id INTEGER REFERENCES traducciones(id) ON DELETE CASCADE,
score FLOAT NOT NULL DEFAULT 0,
created_at TIMESTAMP DEFAULT NOW(),
PRIMARY KEY (traduccion_id, related_traduccion_id)
);
`)
if err != nil {
return err
}
// Ensure traduccion_embeddings table exists
_, err = dbPool.Exec(ctx, `
CREATE TABLE IF NOT EXISTS traduccion_embeddings (
id SERIAL PRIMARY KEY,
traduccion_id INTEGER NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
model TEXT NOT NULL,
dim INTEGER NOT NULL,
embedding DOUBLE PRECISION[] NOT NULL,
created_at TIMESTAMP DEFAULT NOW(),
UNIQUE (traduccion_id, model)
);
`)
if err != nil {
return err
}
_, err = dbPool.Exec(ctx, `
CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model);
`)
if err != nil {
return err
}
_, err = dbPool.Exec(ctx, `
CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id ON traduccion_embeddings(traduccion_id);
`)
return err
}
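// Illustrative only: this worker only reads traduccion_embeddings; nothing in
// this commit writes to it, so another process is assumed to fill it. Against
// the schema created above, a write would look roughly like this sketch (the
// helper name and its caller are assumptions):
func storeEmbeddingSketch(ctx context.Context, traduccionID int64, model string, emb []float64) error {
_, err := dbPool.Exec(ctx, `
INSERT INTO traduccion_embeddings (traduccion_id, model, dim, embedding)
VALUES ($1, $2, $3, $4)
ON CONFLICT (traduccion_id, model) DO UPDATE
SET dim = EXCLUDED.dim, embedding = EXCLUDED.embedding
`, traduccionID, model, len(emb), emb)
return err
}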
func fetchAllEmbeddings(ctx context.Context, model string) ([]Translation, error) {
rows, err := dbPool.Query(ctx, `
SELECT e.traduccion_id,
COALESCE(NULLIF(t.titulo_trad,''), ''),
COALESCE(NULLIF(t.resumen_trad,''), ''),
e.embedding
FROM traduccion_embeddings e
JOIN traducciones t ON t.id = e.traduccion_id
WHERE e.model = $1
AND t.status = 'done'
AND t.lang_to = 'es'
`, model)
if err != nil {
return nil, err
}
defer rows.Close()
var translations []Translation
for rows.Next() {
var t Translation
if err := rows.Scan(&t.ID, &t.Titulo, &t.Resumen, &t.Embedding); err != nil {
continue
}
translations = append(translations, t)
}
return translations, nil
}
func fetchPendingIDs(ctx context.Context, model string, limit int) ([]int64, error) {
rows, err := dbPool.Query(ctx, `
SELECT t.id
FROM traducciones t
JOIN traduccion_embeddings e ON e.traduccion_id = t.id AND e.model = $1
LEFT JOIN related_noticias r ON r.traduccion_id = t.id
WHERE t.lang_to = 'es'
AND t.status = 'done'
GROUP BY t.id
HAVING COUNT(r.related_traduccion_id) = 0
ORDER BY t.id DESC
LIMIT $2
`, model, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var ids []int64
for rows.Next() {
var id int64
if err := rows.Scan(&id); err != nil {
continue
}
ids = append(ids, id)
}
return ids, nil
}
func cosineSimilarity(a, b []float64) float64 {
if len(a) != len(b) || len(a) == 0 {
return 0
}
var dotProduct, normA, normB float64
for i := range a {
dotProduct += a[i] * b[i]
normA += a[i] * a[i]
normB += b[i] * b[i]
}
normA = sqrt(normA)
normB = sqrt(normB)
if normA == 0 || normB == 0 {
return 0
}
return dotProduct / (normA * normB)
}
func sqrt(x float64) float64 {
if x <= 0 {
return 0
}
// Simple Newton-Raphson
z := x
for i := 0; i < 20; i++ {
z = (z + x/z) / 2
}
return z
}
func findTopK(query Embedding, candidates []Translation, k int, minScore float64) []struct {
ID int64
Score float64
} {
type sim struct {
id int64
score float64
}
var similarities []sim
for _, c := range candidates {
if int64(c.ID) == query.ID {
continue
}
score := cosineSimilarity(query.Embedding, c.Embedding)
if score <= minScore {
continue
}
similarities = append(similarities, sim{int64(c.ID), score})
}
// Sort by score descending
for i := 0; i < len(similarities)-1; i++ {
for j := i + 1; j < len(similarities); j++ {
if similarities[j].score > similarities[i].score {
similarities[i], similarities[j] = similarities[j], similarities[i]
}
}
}
if len(similarities) > k {
similarities = similarities[:k]
}
result := make([]struct {
ID int64
Score float64
}, len(similarities))
for i, s := range similarities {
result[i] = struct {
ID int64
Score float64
}{s.id, s.score}
}
return result
}
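// Illustrative only: the nested loop above is an O(n²) selection sort over the
// candidates that cleared minScore. With the standard library "sort" package
// (not imported in this file) the same top-K pick could be written as:
//
// sort.Slice(similarities, func(i, j int) bool {
// return similarities[i].score > similarities[j].score
// })
// if len(similarities) > k {
// similarities = similarities[:k]
// }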
type Embedding struct {
ID int64
Embedding []float64
}
func findEmbeddingByID(embeddings []Embedding, id int64) *Embedding {
for i := range embeddings {
if embeddings[i].ID == id {
return &embeddings[i]
}
}
return nil
}
func insertRelated(ctx context.Context, traduccionID int64, related []struct {
ID int64
Score float64
}) error {
if len(related) == 0 {
return nil
}
for _, r := range related {
if r.Score <= 0 {
continue
}
_, err := dbPool.Exec(ctx, `
INSERT INTO related_noticias (traduccion_id, related_traduccion_id, score)
VALUES ($1, $2, $3)
ON CONFLICT (traduccion_id, related_traduccion_id)
DO UPDATE SET score = EXCLUDED.score
`, traduccionID, r.ID, r.Score)
if err != nil {
logger.Printf("Error inserting related: %v", err)
}
}
return nil
}
func processBatch(ctx context.Context, model string) (int, error) {
// Fetch all embeddings once
allTranslations, err := fetchAllEmbeddings(ctx, model)
if err != nil {
return 0, err
}
if len(allTranslations) == 0 {
return 0, nil
}
// Convert to Embedding format for easier lookup
var allEmbeddings []Embedding
for _, t := range allTranslations {
if t.Embedding != nil {
allEmbeddings = append(allEmbeddings, Embedding{ID: t.ID, Embedding: t.Embedding})
}
}
// Get pending IDs
pendingIDs, err := fetchPendingIDs(ctx, model, batchSz)
if err != nil {
return 0, err
}
if len(pendingIDs) == 0 {
return 0, nil
}
processed := 0
for _, tradID := range pendingIDs {
emb := findEmbeddingByID(allEmbeddings, tradID)
if emb == nil {
continue
}
topRelated := findTopK(*emb, allTranslations, topK, minScore)
if err := insertRelated(ctx, tradID, topRelated); err != nil {
logger.Printf("Error inserting related for %d: %v", tradID, err)
continue
}
processed++
}
return processed, nil
}
func main() {
loadConfig()
logger.Println("Starting Related News Worker")
cfg := workers.LoadDBConfig()
if err := workers.Connect(cfg); err != nil {
logger.Fatalf("Failed to connect to database: %v", err)
}
dbPool = workers.GetPool()
defer workers.Close()
ctx := context.Background()
// Ensure schema
if err := ensureSchema(ctx); err != nil {
logger.Printf("Error ensuring schema: %v", err)
}
model := os.Getenv("EMB_MODEL")
if model == "" {
model = "mxbai-embed-large"
}
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-sigChan
logger.Println("Shutting down...")
os.Exit(0)
}()
logger.Printf("Config: sleep=%ds, topK=%d, batch=%d, model=%s", sleepSec, topK, batchSz, model)
for {
select {
case <-time.After(time.Duration(sleepSec) * time.Second):
count, err := processBatch(ctx, model)
if err != nil {
logger.Printf("Error processing batch: %v", err)
continue
}
if count > 0 {
logger.Printf("Generated related news for %d translations", count)
}
}
}
}

backend/cmd/scraper/main.go Normal file

@@ -0,0 +1,330 @@
package main
import (
"context"
"crypto/md5"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/rss2/backend/internal/workers"
)
var (
logger *log.Logger
pool *pgxpool.Pool
sleepInterval = 60
batchSize = 10
)
type URLSource struct {
ID int64
Nombre string
URL string
CategoriaID *int64
PaisID *int64
Idioma *string
Active bool
}
type Article struct {
Title string
Summary string
Content string
URL string
ImageURL string
PubDate *time.Time
}
func init() {
logger = log.New(os.Stdout, "[SCRAPER] ", log.LstdFlags)
logger.SetOutput(os.Stdout)
}
func loadConfig() {
sleepInterval = getEnvInt("SCRAPER_SLEEP", 60)
batchSize = getEnvInt("SCRAPER_BATCH", 10)
}
func getEnvInt(key string, defaultValue int) int {
if value := os.Getenv(key); value != "" {
if intVal, err := strconv.Atoi(value); err == nil {
return intVal
}
}
return defaultValue
}
func getActiveURLs(ctx context.Context) ([]URLSource, error) {
rows, err := pool.Query(ctx, `
SELECT id, nombre, url, categoria_id, pais_id, idioma, activo
FROM fuentes_url
WHERE activo = true
`)
if err != nil {
return nil, err
}
defer rows.Close()
var sources []URLSource
for rows.Next() {
var s URLSource
err := rows.Scan(&s.ID, &s.Nombre, &s.URL, &s.CategoriaID, &s.PaisID, &s.Idioma, &s.Active)
if err != nil {
continue
}
sources = append(sources, s)
}
return sources, nil
}
func updateSourceStatus(ctx context.Context, sourceID int64, status, message string, httpCode int) error {
_, err := pool.Exec(ctx, `
UPDATE fuentes_url
SET last_check = NOW(),
last_status = $1,
status_message = $2,
last_http_code = $3
WHERE id = $4
`, status, message, httpCode, sourceID)
return err
}
func extractArticle(source URLSource) (*Article, error) {
client := &http.Client{
Timeout: 30 * time.Second,
}
req, err := http.NewRequest("GET", source.URL, nil)
if err != nil {
return nil, err
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
}
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return nil, err
}
article := &Article{
URL: source.URL,
}
// Extract title
article.Title = doc.Find("meta[property='og:title']").First().AttrOr("content", "")
if article.Title == "" {
article.Title = doc.Find("meta[name='title']").First().AttrOr("content", "")
}
if article.Title == "" {
article.Title = doc.Find("h1").First().Text()
}
if article.Title == "" {
article.Title = doc.Find("title").First().Text()
}
// Extract description/summary
article.Summary = doc.Find("meta[property='og:description']").First().AttrOr("content", "")
if article.Summary == "" {
article.Summary = doc.Find("meta[name='description']").First().AttrOr("content", "")
}
// Extract image
article.ImageURL = doc.Find("meta[property='og:image']").First().AttrOr("content", "")
// Extract main content - try common selectors
contentSelectors := []string{
"article",
"[role='main']",
"main",
".article-content",
".post-content",
".entry-content",
".content",
"#content",
}
for _, sel := range contentSelectors {
content := doc.Find(sel).First()
if content.Length() > 0 {
article.Content = content.Text()
break
}
}
// Clean up
article.Title = strings.TrimSpace(article.Title)
article.Summary = strings.TrimSpace(article.Summary)
article.Content = strings.TrimSpace(article.Content)
// Truncate summary if too long
if len(article.Summary) > 500 {
article.Summary = article.Summary[:500]
}
return article, nil
}
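// Illustrative only: extractArticle never fills Article.PubDate, so saveArticle
// always falls back to time.Now(). A sketch of pulling the publication date from
// common meta tags; which tags the target sites actually expose is an assumption.
func extractPubDateSketch(doc *goquery.Document) *time.Time {
selectors := []string{
"meta[property='article:published_time']",
"meta[itemprop='datePublished']",
"meta[name='date']",
}
for _, sel := range selectors {
raw := strings.TrimSpace(doc.Find(sel).First().AttrOr("content", ""))
if raw == "" {
continue
}
for _, layout := range []string{time.RFC3339, "2006-01-02"} {
if ts, err := time.Parse(layout, raw); err == nil {
return &ts
}
}
}
return nil
}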
func saveArticle(ctx context.Context, source URLSource, article *Article) (bool, error) {
finalURL := article.URL
if finalURL == "" {
finalURL = source.URL
}
// Generate ID from URL
articleID := fmt.Sprintf("%x", md5.Sum([]byte(finalURL)))
// Check if exists
var exists bool
err := pool.QueryRow(ctx, "SELECT EXISTS(SELECT 1 FROM noticias WHERE id = $1)", articleID).Scan(&exists)
if err != nil {
return false, err
}
if exists {
return false, nil
}
title := article.Title
if title == "" {
title = "Sin título"
}
summary := article.Summary
if summary == "" && article.Content != "" {
summary = article.Content
if len(summary) > 500 {
summary = summary[:500]
}
}
pubDate := time.Now()
if article.PubDate != nil {
pubDate = *article.PubDate
}
_, err = pool.Exec(ctx, `
INSERT INTO noticias (
id, titulo, resumen, url, fecha, imagen_url,
fuente_nombre, categoria_id, pais_id
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
ON CONFLICT (id) DO NOTHING
`, articleID, title, summary, finalURL, pubDate, article.ImageURL,
source.Nombre, source.CategoriaID, source.PaisID)
if err != nil {
return false, err
}
return true, nil
}
func processSource(ctx context.Context, source URLSource) {
logger.Printf("Processing: %s (%s)", source.Nombre, source.URL)
article, err := extractArticle(source)
if err != nil {
logger.Printf("Error extracting article from %s: %v", source.URL, err)
status := "ERROR"
if strings.Contains(err.Error(), "HTTP") {
status = "ERROR_HTTP"
}
msg := err.Error()
if len(msg) > 200 {
msg = msg[:200] // don't slice past the end of short error messages
}
updateSourceStatus(ctx, source.ID, status, msg, 0)
return
}
if article.Title == "" {
logger.Printf("No title found for %s", source.URL)
updateSourceStatus(ctx, source.ID, "ERROR_PARSE", "No title extracted", 200)
return
}
saved, err := saveArticle(ctx, source, article)
if err != nil {
logger.Printf("Error saving article: %v", err)
msg := err.Error()
if len(msg) > 200 {
msg = msg[:200]
}
updateSourceStatus(ctx, source.ID, "ERROR_DB", msg, 0)
return
}
if saved {
logger.Printf("Saved: %s", article.Title)
updateSourceStatus(ctx, source.ID, "OK", "News created successfully", 200)
} else {
logger.Printf("Already exists: %s", article.Title)
updateSourceStatus(ctx, source.ID, "OK", "News already exists", 200)
}
}
func main() {
loadConfig()
logger.Println("Starting Scraper Worker")
cfg := workers.LoadDBConfig()
if err := workers.Connect(cfg); err != nil {
logger.Fatalf("Failed to connect to database: %v", err)
}
pool = workers.GetPool()
defer workers.Close()
logger.Println("Connected to PostgreSQL")
ctx := context.Background()
// Handle shutdown
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-sigChan
logger.Println("Shutting down...")
os.Exit(0)
}()
logger.Printf("Config: sleep=%ds, batch=%d", sleepInterval, batchSize)
ticker := time.NewTicker(time.Duration(sleepInterval) * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
sources, err := getActiveURLs(ctx)
if err != nil {
logger.Printf("Error fetching URLs: %v", err)
continue
}
if len(sources) == 0 {
logger.Println("No active URLs to process")
continue
}
logger.Printf("Processing %d sources", len(sources))
for _, source := range sources {
processSource(ctx, source)
time.Sleep(2 * time.Second) // Rate limiting
}
}
}
}

backend/cmd/server/main.go Normal file

@@ -0,0 +1,190 @@
package main
import (
"context"
"fmt"
"log"
"os"
"os/signal"
"syscall"
"github.com/gin-gonic/gin"
"github.com/rss2/backend/internal/cache"
"github.com/rss2/backend/internal/config"
"github.com/rss2/backend/internal/db"
"github.com/rss2/backend/internal/handlers"
"github.com/rss2/backend/internal/middleware"
"github.com/rss2/backend/internal/services"
)
func initDB() {
ctx := context.Background()
// Create the entity_aliases table if it does not exist
_, err := db.GetPool().Exec(ctx, `
CREATE TABLE IF NOT EXISTS entity_aliases (
id SERIAL PRIMARY KEY,
canonical_name VARCHAR(255) NOT NULL,
alias VARCHAR(255) NOT NULL,
tipo VARCHAR(50) NOT NULL CHECK (tipo IN ('persona', 'organizacion', 'lugar', 'tema')),
created_at TIMESTAMP DEFAULT NOW(),
UNIQUE(alias, tipo)
)
`)
if err != nil {
log.Printf("Warning: Could not create entity_aliases table: %v", err)
} else {
log.Println("Table entity_aliases ready")
}
// Add the role column to users if it does not exist
_, err = db.GetPool().Exec(ctx, `
ALTER TABLE users ADD COLUMN IF NOT EXISTS role VARCHAR(20) DEFAULT 'user'
`)
if err != nil {
log.Printf("Warning: Could not add role column: %v", err)
} else {
log.Println("Column role ready")
}
// Create the config table if it does not exist
_, err = db.GetPool().Exec(ctx, `
CREATE TABLE IF NOT EXISTS config (
key VARCHAR(100) PRIMARY KEY,
value TEXT,
updated_at TIMESTAMP DEFAULT NOW()
)
`)
if err != nil {
log.Printf("Warning: Could not create config table: %v", err)
} else {
log.Println("Table config ready")
}
// Insert default configuration values if missing
db.GetPool().Exec(ctx, `
INSERT INTO config (key, value) VALUES ('translator_type', 'cpu')
ON CONFLICT (key) DO NOTHING
`)
db.GetPool().Exec(ctx, `
INSERT INTO config (key, value) VALUES ('translator_workers', '2')
ON CONFLICT (key) DO NOTHING
`)
db.GetPool().Exec(ctx, `
INSERT INTO config (key, value) VALUES ('translator_status', 'stopped')
ON CONFLICT (key) DO NOTHING
`)
}
func main() {
cfg := config.Load()
if err := db.Connect(cfg.DatabaseURL); err != nil {
log.Fatalf("Failed to connect to database: %v", err)
}
defer db.Close()
log.Println("Connected to PostgreSQL")
// Auto-setup DB tables
initDB()
if err := cache.Connect(cfg.RedisURL); err != nil {
log.Printf("Warning: Failed to connect to Redis: %v", err)
} else {
defer cache.Close()
log.Println("Connected to Redis")
}
services.Init(cfg)
r := gin.Default()
r.Use(middleware.CORSMiddleware())
r.Use(middleware.LoggerMiddleware())
r.GET("/health", func(c *gin.Context) {
c.JSON(200, gin.H{"status": "ok"})
})
api := r.Group("/api")
{
// Serve static images downloaded by wiki_worker
api.StaticFS("/wiki-images", gin.Dir("/app/data/wiki_images", false))
api.POST("/auth/login", handlers.Login)
api.POST("/auth/register", handlers.Register)
api.GET("/auth/check-first-user", handlers.CheckFirstUser)
news := api.Group("/news")
{
news.GET("", handlers.GetNews)
news.GET("/:id", handlers.GetNewsByID)
news.DELETE("/:id", middleware.AuthRequired(), handlers.DeleteNews)
}
feeds := api.Group("/feeds")
{
feeds.GET("", handlers.GetFeeds)
feeds.GET("/export", handlers.ExportFeeds)
feeds.GET("/:id", handlers.GetFeedByID)
feeds.POST("", middleware.AuthRequired(), handlers.CreateFeed)
feeds.POST("/import", middleware.AuthRequired(), handlers.ImportFeeds)
feeds.PUT("/:id", middleware.AuthRequired(), handlers.UpdateFeed)
feeds.DELETE("/:id", middleware.AuthRequired(), handlers.DeleteFeed)
feeds.POST("/:id/toggle", middleware.AuthRequired(), handlers.ToggleFeedActive)
feeds.POST("/:id/reactivate", middleware.AuthRequired(), handlers.ReactivateFeed)
}
api.GET("/search", handlers.SearchNews)
api.GET("/entities", handlers.GetEntities)
api.GET("/stats", handlers.GetStats)
api.GET("/categories", handlers.GetCategories)
api.GET("/countries", handlers.GetCountries)
admin := api.Group("/admin")
admin.Use(middleware.AuthRequired(), middleware.AdminRequired())
{
admin.POST("/aliases", handlers.CreateAlias)
admin.GET("/aliases/export", handlers.ExportAliases)
admin.POST("/aliases/import", handlers.ImportAliases)
admin.POST("/entities/retype", handlers.PatchEntityTipo)
admin.GET("/backup", handlers.BackupDatabase)
admin.GET("/backup/news", handlers.BackupNewsZipped)
admin.GET("/users", handlers.GetUsers)
admin.POST("/users/:id/promote", handlers.PromoteUser)
admin.POST("/users/:id/demote", handlers.DemoteUser)
admin.POST("/reset-db", handlers.ResetDatabase)
admin.GET("/workers/status", handlers.GetWorkerStatus)
admin.POST("/workers/config", handlers.SetWorkerConfig)
admin.POST("/workers/start", handlers.StartWorkers)
admin.POST("/workers/stop", handlers.StopWorkers)
}
auth := api.Group("/auth")
auth.Use(middleware.AuthRequired())
{
auth.GET("/me", handlers.GetCurrentUser)
}
}
middleware.SetJWTSecret(cfg.SecretKey)
port := cfg.ServerPort
addr := fmt.Sprintf(":%s", port)
go func() {
log.Printf("Server starting on %s", addr)
if err := r.Run(addr); err != nil {
log.Fatalf("Failed to start server: %v", err)
}
}()
quit := make(chan os.Signal, 1)
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
<-quit
log.Println("Shutting down server...")
}
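// Illustrative only: main above starts gin with r.Run inside a goroutine and
// returns as soon as a signal arrives, so in-flight requests are simply cut off.
// A graceful variant could wrap the engine in an http.Server and drain it with
// Shutdown; this sketch assumes the extra "net/http" and "time" imports:
//
// srv := &http.Server{Addr: addr, Handler: r}
// go func() {
// if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
// log.Fatalf("Failed to start server: %v", err)
// }
// }()
// <-quit
// log.Println("Shutting down server...")
// shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
// defer cancel()
// if err := srv.Shutdown(shutdownCtx); err != nil {
// log.Printf("Forced shutdown: %v", err)
// }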

backend/cmd/topics/main.go Normal file

@@ -0,0 +1,383 @@
package main
import (
"context"
"log"
"os"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/rss2/backend/internal/workers"
)
var (
logger *log.Logger
dbPool *pgxpool.Pool
sleepSec = 10
batchSz = 500
)
type Topic struct {
ID int64
Weight int
Keywords []string
}
type Country struct {
ID int64
Name string
Keywords []string
}
func init() {
logger = log.New(os.Stdout, "[TOPICS] ", log.LstdFlags)
}
func loadConfig() {
sleepSec = getEnvInt("TOPICS_SLEEP", 10)
batchSz = getEnvInt("TOPICS_BATCH", 500)
}
func getEnvInt(key string, defaultValue int) int {
if value := os.Getenv(key); value != "" {
if intVal, err := strconv.Atoi(value); err == nil {
return intVal
}
}
return defaultValue
}
func ensureSchema(ctx context.Context) error {
_, err := dbPool.Exec(ctx, `
CREATE TABLE IF NOT EXISTS topics (
id SERIAL PRIMARY KEY,
slug VARCHAR(50) UNIQUE NOT NULL,
name VARCHAR(100) NOT NULL,
weight INTEGER DEFAULT 1,
keywords TEXT,
group_name VARCHAR(50)
);
`)
if err != nil {
return err
}
_, err = dbPool.Exec(ctx, `
CREATE TABLE IF NOT EXISTS news_topics (
noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
score INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT NOW(),
PRIMARY KEY (noticia_id, topic_id)
);
`)
if err != nil {
return err
}
_, err = dbPool.Exec(ctx, `
ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
`)
return err
}
func loadTopics(ctx context.Context) ([]Topic, error) {
rows, err := dbPool.Query(ctx, "SELECT id, weight, keywords FROM topics")
if err != nil {
return nil, err
}
defer rows.Close()
var topics []Topic
for rows.Next() {
var t Topic
var kwStr *string
if err := rows.Scan(&t.ID, &t.Weight, &kwStr); err != nil {
continue
}
if kwStr != nil {
keywords := strings.Split(*kwStr, ",")
for i := range keywords {
keywords[i] = strings.ToLower(strings.TrimSpace(keywords[i]))
}
t.Keywords = keywords
}
topics = append(topics, t)
}
return topics, nil
}
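// Illustrative only: nothing in this commit seeds the topics table, and
// loadTopics expects keywords as a single comma-separated TEXT value per row.
// A hypothetical seed row (slug, name and keywords are made up) could be
// inserted like this:
func seedExampleTopic(ctx context.Context) error {
_, err := dbPool.Exec(ctx, `
INSERT INTO topics (slug, name, weight, keywords, group_name)
VALUES ('clima', 'Cambio climático', 2, 'cambio climático,emisiones,calentamiento global', 'ciencia')
ON CONFLICT (slug) DO NOTHING
`)
return err
}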
func loadCountries(ctx context.Context) ([]Country, error) {
rows, err := dbPool.Query(ctx, "SELECT id, nombre FROM paises")
if err != nil {
return nil, err
}
defer rows.Close()
aliases := map[string][]string{
"Estados Unidos": {"eeuu", "ee.uu.", "usa", "estadounidense", "washington"},
"Rusia": {"ruso", "rusa", "moscú", "kremlin"},
"China": {"chino", "china", "pekin", "beijing"},
"Ucrania": {"ucraniano", "kiev", "kyiv"},
"Israel": {"israelí", "tel aviv", "jerusalén"},
"España": {"español", "madrid"},
"Reino Unido": {"uk", "londres", "británico"},
"Francia": {"francés", "parís"},
"Alemania": {"alemán", "berlín"},
"Palestina": {"palestino", "gaza", "cisjordania"},
"Irán": {"iraní", "teherán"},
}
var countries []Country
for rows.Next() {
var c Country
if err := rows.Scan(&c.ID, &c.Name); err != nil {
continue
}
c.Keywords = []string{strings.ToLower(c.Name)}
if kw, ok := aliases[c.Name]; ok {
c.Keywords = append(c.Keywords, kw...)
}
countries = append(countries, c)
}
return countries, nil
}
type NewsItem struct {
ID string
Titulo *string
Resumen *string
}
func fetchPendingNews(ctx context.Context, limit int) ([]NewsItem, error) {
rows, err := dbPool.Query(ctx, `
SELECT id, titulo, resumen
FROM noticias
WHERE topics_processed = FALSE
ORDER BY fecha DESC
LIMIT $1
`, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var items []NewsItem
for rows.Next() {
var n NewsItem
if err := rows.Scan(&n.ID, &n.Titulo, &n.Resumen); err != nil {
continue
}
items = append(items, n)
}
return items, nil
}
func findTopics(text string, topics []Topic) []struct {
TopicID int64
Score int
} {
text = strings.ToLower(text)
var matches []struct {
TopicID int64
Score int
}
for _, topic := range topics {
count := 0
for _, kw := range topic.Keywords {
if strings.Contains(text, kw) {
count++
}
}
if count > 0 {
matches = append(matches, struct {
TopicID int64
Score int
}{topic.ID, topic.Weight * count})
}
}
return matches
}
func findBestCountry(text string, countries []Country) *int64 {
text = strings.ToLower(text)
bestID := new(int64)
bestCount := 0
for _, c := range countries {
count := 0
for _, kw := range c.Keywords {
if strings.Contains(text, kw) {
count++
}
}
if count > bestCount {
bestCount = count
*bestID = c.ID
}
}
if bestCount > 0 {
return bestID
}
return nil
}
func processBatch(ctx context.Context, topics []Topic, countries []Country) (int, error) {
items, err := fetchPendingNews(ctx, batchSz)
if err != nil {
return 0, err
}
if len(items) == 0 {
return 0, nil
}
type topicMatch struct {
NoticiaID string
TopicID int64
Score int
}
type countryUpdate struct {
PaisID int64
NoticiaID string
}
var topicMatches []topicMatch
var countryUpdates []countryUpdate
var processedIDs []string
for _, item := range items {
var text string
if item.Titulo != nil {
text += *item.Titulo
}
if item.Resumen != nil {
text += " " + *item.Resumen
}
// Find topics
matches := findTopics(text, topics)
for _, m := range matches {
topicMatches = append(topicMatches, topicMatch{item.ID, m.TopicID, m.Score})
}
// Find best country
if countryID := findBestCountry(text, countries); countryID != nil {
countryUpdates = append(countryUpdates, countryUpdate{*countryID, item.ID})
}
processedIDs = append(processedIDs, item.ID)
}
// Insert topic relations
if len(topicMatches) > 0 {
for _, tm := range topicMatches {
_, err := dbPool.Exec(ctx, `
INSERT INTO news_topics (noticia_id, topic_id, score)
VALUES ($1, $2, $3)
ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
`, tm.NoticiaID, tm.TopicID, tm.Score)
if err != nil {
logger.Printf("Error inserting topic: %v", err)
}
}
}
// Update country
if len(countryUpdates) > 0 {
for _, cu := range countryUpdates {
_, err := dbPool.Exec(ctx, `
UPDATE noticias SET pais_id = $1 WHERE id = $2
`, cu.PaisID, cu.NoticiaID)
if err != nil {
logger.Printf("Error updating country: %v", err)
}
}
}
// Mark as processed
if len(processedIDs) > 0 {
_, err := dbPool.Exec(ctx, `
UPDATE noticias SET topics_processed = TRUE WHERE id = ANY($1)
`, processedIDs)
if err != nil {
return 0, err
}
}
return len(items), nil
}
func main() {
loadConfig()
logger.Println("Starting Topics Worker")
cfg := workers.LoadDBConfig()
if err := workers.Connect(cfg); err != nil {
logger.Fatalf("Failed to connect to database: %v", err)
}
dbPool = workers.GetPool()
defer workers.Close()
ctx := context.Background()
// Ensure schema
if err := ensureSchema(ctx); err != nil {
logger.Printf("Error ensuring schema: %v", err)
}
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-sigChan
logger.Println("Shutting down...")
os.Exit(0)
}()
logger.Printf("Config: sleep=%ds, batch=%d", sleepSec, batchSz)
for {
select {
case <-time.After(time.Duration(sleepSec) * time.Second):
topics, err := loadTopics(ctx)
if err != nil {
logger.Printf("Error loading topics: %v", err)
continue
}
if len(topics) == 0 {
logger.Println("No topics found in DB")
time.Sleep(time.Duration(sleepSec) * time.Second)
continue
}
countries, err := loadCountries(ctx)
if err != nil {
logger.Printf("Error loading countries: %v", err)
continue
}
count, err := processBatch(ctx, topics, countries)
if err != nil {
logger.Printf("Error processing batch: %v", err)
continue
}
if count > 0 {
logger.Printf("Processed %d news items", count)
}
if count < batchSz {
time.Sleep(time.Duration(sleepSec) * time.Second)
}
}
}
}


@@ -0,0 +1,267 @@
package main
import (
"context"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"net/url"
"os"
"os/signal"
"path/filepath"
"strings"
"syscall"
"time"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/rss2/backend/internal/workers"
)
var (
logger *log.Logger
pool *pgxpool.Pool
sleepInterval = 30
batchSize = 50
imagesDir = "/app/data/wiki_images"
)
type WikiSummary struct {
Type string `json:"type"`
Title string `json:"title"`
DisplayTitle string `json:"displaytitle"`
Extract string `json:"extract"`
ContentUrls struct {
Desktop struct {
Page string `json:"page"`
} `json:"desktop"`
} `json:"content_urls"`
Thumbnail *struct {
Source string `json:"source"`
Width int `json:"width"`
Height int `json:"height"`
} `json:"thumbnail"`
}
type Tag struct {
ID int64
Valor string
Tipo string
}
func init() {
logger = log.New(os.Stdout, "[WIKI_WORKER] ", log.LstdFlags)
}
func getPendingTags(ctx context.Context) ([]Tag, error) {
rows, err := pool.Query(ctx, `
SELECT t.id, t.valor, t.tipo
FROM tags t
LEFT JOIN (
SELECT tag_id, COUNT(*) as cnt
FROM tags_noticia
GROUP BY tag_id
) c ON c.tag_id = t.id
WHERE t.tipo IN ('persona', 'organizacion')
AND t.wiki_checked = FALSE
ORDER BY COALESCE(c.cnt, 0) DESC, t.id DESC
LIMIT $1
`, batchSize)
if err != nil {
return nil, err
}
defer rows.Close()
var tags []Tag
for rows.Next() {
var t Tag
if err := rows.Scan(&t.ID, &t.Valor, &t.Tipo); err == nil {
tags = append(tags, t)
}
}
return tags, nil
}
func downloadImage(imgURL, destPath string) error {
client := &http.Client{Timeout: 15 * time.Second}
req, err := http.NewRequest("GET", imgURL, nil)
if err != nil {
return err
}
req.Header.Set("User-Agent", "RSS2-WikiWorker/1.0 (https://github.com/proyecto/rss2)")
resp, err := client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return fmt.Errorf("HTTP %d", resp.StatusCode)
}
out, err := os.Create(destPath)
if err != nil {
return err
}
defer out.Close()
_, err = io.Copy(out, resp.Body)
return err
}
func fetchWikipediaInfo(valor string) (*WikiSummary, error) {
// Normalize the value to be wiki-compatible
title := strings.ReplaceAll(strings.TrimSpace(valor), " ", "_")
encodedTitle := url.PathEscape(title)
apiURL := fmt.Sprintf("https://es.wikipedia.org/api/rest_v1/page/summary/%s", encodedTitle)
client := &http.Client{Timeout: 10 * time.Second}
req, err := http.NewRequest("GET", apiURL, nil)
if err != nil {
return nil, err
}
// Per MediaWiki API policy: https://meta.wikimedia.org/wiki/User-Agent_policy
req.Header.Set("User-Agent", "RSS2-WikiWorker/1.0 (pietrelinux@gmail.com)")
resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode == 429 {
return nil, fmt.Errorf("HTTP 429: Too Many Requests (Rate Limited)")
}
if resp.StatusCode == 404 {
return nil, nil // Not found, but handled successfully without error
}
if resp.StatusCode != 200 {
return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
}
var summary WikiSummary
if err := json.NewDecoder(resp.Body).Decode(&summary); err != nil {
return nil, err
}
// Filter out disambiguation pages
if summary.Type == "disambiguation" {
return nil, nil // Treat as not found to strictly avoid incorrect tooltips
}
return &summary, nil
}
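// Illustrative only: on HTTP 429 the worker just logs the error and moves on,
// relying on the fixed 3-second pause between tags. A small helper like this
// sketch could honour the Retry-After header instead (the helper and its use
// are assumptions, and it relies on the "strconv" import):
func retryAfterSecondsSketch(resp *http.Response, fallback int) int {
if v := resp.Header.Get("Retry-After"); v != "" {
if secs, err := strconv.Atoi(v); err == nil && secs > 0 {
return secs
}
}
return fallback
}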
func processTag(ctx context.Context, tag Tag) {
logger.Printf("Procesando tag %d: %s", tag.ID, tag.Valor)
summary, err := fetchWikipediaInfo(tag.Valor)
if err != nil {
logger.Printf("Error al consultar Wikipedia para %s: %v", tag.Valor, err)
return
}
if summary == nil || summary.Extract == "" {
// Not found or disambiguation
_, _ = pool.Exec(ctx, "UPDATE tags SET wiki_checked = TRUE WHERE id = $1", tag.ID)
logger.Printf("No se encontraron resultados válidos en Wikipedia para: %s", tag.Valor)
return
}
var localImagePath *string
if summary.Thumbnail != nil && summary.Thumbnail.Source != "" {
ext := ".jpg"
if strings.HasSuffix(strings.ToLower(summary.Thumbnail.Source), ".png") {
ext = ".png"
}
fileName := fmt.Sprintf("wiki_%d%s", tag.ID, ext)
destPath := filepath.Join(imagesDir, fileName)
if err := downloadImage(summary.Thumbnail.Source, destPath); err != nil {
logger.Printf("Error descargando imagen para %s: %v", tag.Valor, err)
// Guardaremos la URL externa como fallback si falla la descarga
src := summary.Thumbnail.Source
localImagePath = &src
} else {
relativePath := "/api/wiki-images/" + fileName
localImagePath = &relativePath
}
}
wikiURL := summary.ContentUrls.Desktop.Page
_, err = pool.Exec(ctx, `
UPDATE tags
SET wiki_summary = $1,
wiki_url = $2,
image_path = $3,
wiki_checked = TRUE
WHERE id = $4
`, summary.Extract, wikiURL, localImagePath, tag.ID)
if err != nil {
logger.Printf("Error al actualizar la base de datos para tag %d: %v", tag.ID, err)
} else {
logger.Printf("Actualizado con éxito: %s (Imagen: %v)", tag.Valor, localImagePath != nil)
}
}
func main() {
if val := os.Getenv("WIKI_SLEEP"); val != "" {
if sleep, err := fmt.Sscanf(val, "%d", &sleepInterval); err == nil && sleep > 0 {
sleepInterval = sleep
}
}
logger.Println("Iniciando Wiki Worker...")
if err := os.MkdirAll(imagesDir, 0755); err != nil {
logger.Fatalf("Error creando directorio de imágenes: %v", err)
}
cfg := workers.LoadDBConfig()
if err := workers.Connect(cfg); err != nil {
logger.Fatalf("Failed to connect to database: %v", err)
}
pool = workers.GetPool()
defer workers.Close()
ctx := context.Background()
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-sigChan
logger.Println("Cerrando gracefully...")
workers.Close()
os.Exit(0)
}()
logger.Printf("Configuración: sleep=%ds, batch=%d", sleepInterval, batchSize)
for {
tags, err := getPendingTags(ctx)
if err != nil {
logger.Printf("Error recuperando tags pendientes: %v", err)
time.Sleep(10 * time.Second)
continue
}
if len(tags) == 0 {
logger.Printf("No hay tags pendientes. Durmiendo %d segundos...", sleepInterval)
time.Sleep(time.Duration(sleepInterval) * time.Second)
continue
}
logger.Printf("Recuperados %d tags para procesar...", len(tags))
for _, tag := range tags {
processTag(ctx, tag)
time.Sleep(3 * time.Second) // Increased delay to avoid Wikipedia Rate Limits (429)
}
}
}