rss2/backend/cmd/wiki_worker/main.go

267 lines
6.4 KiB
Go

package main
import (
"context"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"net/url"
"os"
"os/signal"
"path/filepath"
"strings"
"syscall"
"time"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/rss2/backend/internal/workers"
)
var (
// logger is the worker-wide logger; configured in init with a
// "[WIKI_WORKER] " prefix on stdout.
logger *log.Logger
// pool is the shared pgx connection pool; set in main via workers.GetPool.
pool *pgxpool.Pool
// sleepInterval is the idle sleep in seconds when no tags are pending;
// overridable through the WIKI_SLEEP environment variable (see main).
sleepInterval = 30
// batchSize caps how many pending tags are fetched per loop iteration.
batchSize = 50
// imagesDir is where downloaded Wikipedia thumbnails are written.
// NOTE(review): hard-coded container path — presumably a Docker volume; confirm.
imagesDir = "/app/data/wiki_images"
)
// WikiSummary mirrors the subset of the MediaWiki REST
// /page/summary response that this worker consumes.
type WikiSummary struct {
	// Type distinguishes regular articles from e.g. "disambiguation" pages.
	Type         string `json:"type"`
	Title        string `json:"title"`
	DisplayTitle string `json:"displaytitle"`
	// Extract is the plain-text summary stored as the tag's wiki_summary.
	Extract string `json:"extract"`
	// ContentUrls carries the canonical desktop article URL.
	ContentUrls struct {
		Desktop struct {
			Page string `json:"page"`
		} `json:"desktop"`
	} `json:"content_urls"`
	// Thumbnail is nil when the article has no lead image.
	Thumbnail *struct {
		Source string `json:"source"`
		Width  int    `json:"width"`
		Height int    `json:"height"`
	} `json:"thumbnail"`
}
// Tag is the minimal projection of a row from the tags table that the
// worker needs to enrich it with Wikipedia data.
type Tag struct {
	ID    int64  // primary key of the tags row
	Valor string // tag text, e.g. a person or organization name
	Tipo  string // tag type; this worker only sees 'persona'/'organizacion'
}
// init wires up the worker's stdout logger with a recognizable prefix
// and the standard date/time flags.
func init() {
	const prefix = "[WIKI_WORKER] "
	logger = log.New(os.Stdout, prefix, log.LstdFlags)
}
// getPendingTags returns up to batchSize tags of type 'persona' or
// 'organizacion' that have not yet been checked against Wikipedia,
// most-referenced (per tags_noticia) first.
//
// Fixes over the original: a row that fails to Scan is now reported
// instead of being silently skipped, and rows.Err() is checked so
// iteration errors (e.g. a dropped connection) are no longer lost.
func getPendingTags(ctx context.Context) ([]Tag, error) {
	rows, err := pool.Query(ctx, `
SELECT t.id, t.valor, t.tipo
FROM tags t
LEFT JOIN (
SELECT tag_id, COUNT(*) as cnt
FROM tags_noticia
GROUP BY tag_id
) c ON c.tag_id = t.id
WHERE t.tipo IN ('persona', 'organizacion')
AND t.wiki_checked = FALSE
ORDER BY COALESCE(c.cnt, 0) DESC, t.id DESC
LIMIT $1
`, batchSize)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var tags []Tag
	for rows.Next() {
		var t Tag
		if err := rows.Scan(&t.ID, &t.Valor, &t.Tipo); err != nil {
			// Surface scan failures: silently dropping rows would hide
			// schema mismatches indefinitely.
			return nil, fmt.Errorf("scanning pending tag: %w", err)
		}
		tags = append(tags, t)
	}
	// rows.Next terminates silently on iteration errors; rows.Err reports them.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return tags, nil
}
// downloadImage fetches imgURL over HTTP and writes the response body to
// destPath. On any failure after the destination file has been created,
// the partial file is removed so callers never observe a truncated image.
//
// Fixes over the original: an io.Copy failure no longer leaves a partial
// file behind, and the final Close error is checked (a failed close can
// mean lost writes on some filesystems).
func downloadImage(imgURL, destPath string) error {
	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", imgURL, nil)
	if err != nil {
		return err
	}
	// Wikimedia's policy requires an identifying User-Agent.
	req.Header.Set("User-Agent", "RSS2-WikiWorker/1.0 (https://github.com/proyecto/rss2)")
	resp, err := client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("HTTP %d", resp.StatusCode)
	}
	out, err := os.Create(destPath)
	if err != nil {
		return err
	}
	if _, err := io.Copy(out, resp.Body); err != nil {
		out.Close()
		os.Remove(destPath) // don't leave a truncated image on disk
		return err
	}
	if err := out.Close(); err != nil {
		os.Remove(destPath)
		return err
	}
	return nil
}
// fetchWikipediaInfo looks up the Spanish-Wikipedia REST summary for valor.
// A nil *WikiSummary together with a nil error means "no usable page":
// either the article does not exist (404) or it is a disambiguation page.
func fetchWikipediaInfo(valor string) (*WikiSummary, error) {
	// Wikipedia titles use underscores in place of spaces.
	wikiTitle := strings.ReplaceAll(strings.TrimSpace(valor), " ", "_")
	endpoint := fmt.Sprintf("https://es.wikipedia.org/api/rest_v1/page/summary/%s", url.PathEscape(wikiTitle))

	httpClient := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", endpoint, nil)
	if err != nil {
		return nil, err
	}
	// Required by the MediaWiki User-Agent policy:
	// https://meta.wikimedia.org/wiki/User-Agent_policy
	req.Header.Set("User-Agent", "RSS2-WikiWorker/1.0 (pietrelinux@gmail.com)")

	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	switch {
	case resp.StatusCode == 429:
		return nil, fmt.Errorf("HTTP 429: Too Many Requests (Rate Limited)")
	case resp.StatusCode == 404:
		// Article missing: not an error, just nothing to store.
		return nil, nil
	case resp.StatusCode != 200:
		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	summary := &WikiSummary{}
	if err := json.NewDecoder(resp.Body).Decode(summary); err != nil {
		return nil, err
	}
	// A disambiguation page would make for a misleading tooltip; treat it
	// the same as "not found".
	if summary.Type == "disambiguation" {
		return nil, nil
	}
	return summary, nil
}
// processTag enriches one tag with Wikipedia data: it fetches the article
// summary, optionally downloads the thumbnail to imagesDir, and persists
// the result. In every terminal case except a transient Wikipedia error,
// the tag is marked wiki_checked so it will not be retried.
func processTag(ctx context.Context, tag Tag) {
	logger.Printf("Procesando tag %d: %s", tag.ID, tag.Valor)

	summary, err := fetchWikipediaInfo(tag.Valor)
	if err != nil {
		// Transient failure (e.g. rate limit): leave wiki_checked FALSE
		// so the tag is picked up again on a later pass.
		logger.Printf("Error al consultar Wikipedia para %s: %v", tag.Valor, err)
		return
	}
	if summary == nil || summary.Extract == "" {
		// Not found or disambiguation: mark as checked so we stop retrying.
		_, _ = pool.Exec(ctx, "UPDATE tags SET wiki_checked = TRUE WHERE id = $1", tag.ID)
		logger.Printf("No se encontraron resultados válidos en Wikipedia para: %s", tag.Valor)
		return
	}

	var localImagePath *string
	if thumb := summary.Thumbnail; thumb != nil && thumb.Source != "" {
		extension := ".jpg"
		if strings.HasSuffix(strings.ToLower(thumb.Source), ".png") {
			extension = ".png"
		}
		fileName := fmt.Sprintf("wiki_%d%s", tag.ID, extension)
		if err := downloadImage(thumb.Source, filepath.Join(imagesDir, fileName)); err != nil {
			logger.Printf("Error descargando imagen para %s: %v", tag.Valor, err)
			// Download failed: fall back to storing the external URL.
			src := thumb.Source
			localImagePath = &src
		} else {
			served := "/api/wiki-images/" + fileName
			localImagePath = &served
		}
	}

	_, err = pool.Exec(ctx, `
UPDATE tags
SET wiki_summary = $1,
wiki_url = $2,
image_path = $3,
wiki_checked = TRUE
WHERE id = $4
`, summary.Extract, summary.ContentUrls.Desktop.Page, localImagePath, tag.ID)
	if err != nil {
		logger.Printf("Error al actualizar la base de datos para tag %d: %v", tag.ID, err)
		return
	}
	logger.Printf("Actualizado con éxito: %s (Imagen: %v)", tag.Valor, localImagePath != nil)
}
func main() {
if val := os.Getenv("WIKI_SLEEP"); val != "" {
if sleep, err := fmt.Sscanf(val, "%d", &sleepInterval); err == nil && sleep > 0 {
sleepInterval = sleep
}
}
logger.Println("Iniciando Wiki Worker...")
if err := os.MkdirAll(imagesDir, 0755); err != nil {
logger.Fatalf("Error creando directorio de imágenes: %v", err)
}
cfg := workers.LoadDBConfig()
if err := workers.Connect(cfg); err != nil {
logger.Fatalf("Failed to connect to database: %v", err)
}
pool = workers.GetPool()
defer workers.Close()
ctx := context.Background()
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
<-sigChan
logger.Println("Cerrando gracefully...")
workers.Close()
os.Exit(0)
}()
logger.Printf("Configuración: sleep=%ds, batch=%d", sleepInterval, batchSize)
for {
tags, err := getPendingTags(ctx)
if err != nil {
logger.Printf("Error recuperando tags pendientes: %v", err)
time.Sleep(10 * time.Second)
continue
}
if len(tags) == 0 {
logger.Printf("No hay tags pendientes. Durmiendo %d segundos...", sleepInterval)
time.Sleep(time.Duration(sleepInterval) * time.Second)
continue
}
logger.Printf("Recuperados %d tags para procesar...", len(tags))
for _, tag := range tags {
processTag(ctx, tag)
time.Sleep(3 * time.Second) // Increased delay to avoid Wikipedia Rate Limits (429)
}
}
}