267 lines
6.4 KiB
Go
267 lines
6.4 KiB
Go
package main
|
|
|
|
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"os/signal"
	"path/filepath"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/rss2/backend/internal/workers"
)
|
|
|
|
var (
|
|
logger *log.Logger
|
|
pool *pgxpool.Pool
|
|
sleepInterval = 30
|
|
batchSize = 50
|
|
imagesDir = "/app/data/wiki_images"
|
|
)
|
|
|
|
type WikiSummary struct {
|
|
Type string `json:"type"`
|
|
Title string `json:"title"`
|
|
DisplayTitle string `json:"displaytitle"`
|
|
Extract string `json:"extract"`
|
|
ContentUrls struct {
|
|
Desktop struct {
|
|
Page string `json:"page"`
|
|
} `json:"desktop"`
|
|
} `json:"content_urls"`
|
|
Thumbnail *struct {
|
|
Source string `json:"source"`
|
|
Width int `json:"width"`
|
|
Height int `json:"height"`
|
|
} `json:"thumbnail"`
|
|
}
|
|
|
|
type Tag struct {
|
|
ID int64
|
|
Valor string
|
|
Tipo string
|
|
}
|
|
|
|
func init() {
|
|
logger = log.New(os.Stdout, "[WIKI_WORKER] ", log.LstdFlags)
|
|
}
|
|
|
|
func getPendingTags(ctx context.Context) ([]Tag, error) {
|
|
rows, err := pool.Query(ctx, `
|
|
SELECT t.id, t.valor, t.tipo
|
|
FROM tags t
|
|
LEFT JOIN (
|
|
SELECT tag_id, COUNT(*) as cnt
|
|
FROM tags_noticia
|
|
GROUP BY tag_id
|
|
) c ON c.tag_id = t.id
|
|
WHERE t.tipo IN ('persona', 'organizacion')
|
|
AND t.wiki_checked = FALSE
|
|
ORDER BY COALESCE(c.cnt, 0) DESC, t.id DESC
|
|
LIMIT $1
|
|
`, batchSize)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
var tags []Tag
|
|
for rows.Next() {
|
|
var t Tag
|
|
if err := rows.Scan(&t.ID, &t.Valor, &t.Tipo); err == nil {
|
|
tags = append(tags, t)
|
|
}
|
|
}
|
|
return tags, nil
|
|
}
|
|
|
|
func downloadImage(imgURL, destPath string) error {
|
|
client := &http.Client{Timeout: 15 * time.Second}
|
|
req, err := http.NewRequest("GET", imgURL, nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
req.Header.Set("User-Agent", "RSS2-WikiWorker/1.0 (https://github.com/proyecto/rss2)")
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 {
|
|
return fmt.Errorf("HTTP %d", resp.StatusCode)
|
|
}
|
|
|
|
out, err := os.Create(destPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer out.Close()
|
|
|
|
_, err = io.Copy(out, resp.Body)
|
|
return err
|
|
}
|
|
|
|
func fetchWikipediaInfo(valor string) (*WikiSummary, error) {
|
|
// Normalize the value to be wiki-compatible
|
|
title := strings.ReplaceAll(strings.TrimSpace(valor), " ", "_")
|
|
encodedTitle := url.PathEscape(title)
|
|
|
|
apiURL := fmt.Sprintf("https://es.wikipedia.org/api/rest_v1/page/summary/%s", encodedTitle)
|
|
|
|
client := &http.Client{Timeout: 10 * time.Second}
|
|
req, err := http.NewRequest("GET", apiURL, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
// Per MediaWiki API policy: https://meta.wikimedia.org/wiki/User-Agent_policy
|
|
req.Header.Set("User-Agent", "RSS2-WikiWorker/1.0 (pietrelinux@gmail.com)")
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode == 429 {
|
|
return nil, fmt.Errorf("HTTP 429: Too Many Requests (Rate Limited)")
|
|
}
|
|
if resp.StatusCode == 404 {
|
|
return nil, nil // Not found, but handled successfully without error
|
|
}
|
|
if resp.StatusCode != 200 {
|
|
return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
|
|
}
|
|
|
|
var summary WikiSummary
|
|
if err := json.NewDecoder(resp.Body).Decode(&summary); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Filter out disambiguation pages
|
|
if summary.Type == "disambiguation" {
|
|
return nil, nil // Treat as not found to strictly avoid incorrect tooltips
|
|
}
|
|
|
|
return &summary, nil
|
|
}
|
|
|
|
func processTag(ctx context.Context, tag Tag) {
|
|
logger.Printf("Procesando tag %d: %s", tag.ID, tag.Valor)
|
|
|
|
summary, err := fetchWikipediaInfo(tag.Valor)
|
|
if err != nil {
|
|
logger.Printf("Error al consultar Wikipedia para %s: %v", tag.Valor, err)
|
|
return
|
|
}
|
|
|
|
if summary == nil || summary.Extract == "" {
|
|
// Not found or disambiguation
|
|
_, _ = pool.Exec(ctx, "UPDATE tags SET wiki_checked = TRUE WHERE id = $1", tag.ID)
|
|
logger.Printf("No se encontraron resultados válidos en Wikipedia para: %s", tag.Valor)
|
|
return
|
|
}
|
|
|
|
var localImagePath *string
|
|
if summary.Thumbnail != nil && summary.Thumbnail.Source != "" {
|
|
ext := ".jpg"
|
|
if strings.HasSuffix(strings.ToLower(summary.Thumbnail.Source), ".png") {
|
|
ext = ".png"
|
|
}
|
|
fileName := fmt.Sprintf("wiki_%d%s", tag.ID, ext)
|
|
destPath := filepath.Join(imagesDir, fileName)
|
|
|
|
if err := downloadImage(summary.Thumbnail.Source, destPath); err != nil {
|
|
logger.Printf("Error descargando imagen para %s: %v", tag.Valor, err)
|
|
// Guardaremos la URL externa como fallback si falla la descarga
|
|
src := summary.Thumbnail.Source
|
|
localImagePath = &src
|
|
} else {
|
|
relativePath := "/api/wiki-images/" + fileName
|
|
localImagePath = &relativePath
|
|
}
|
|
}
|
|
|
|
wikiURL := summary.ContentUrls.Desktop.Page
|
|
|
|
_, err = pool.Exec(ctx, `
|
|
UPDATE tags
|
|
SET wiki_summary = $1,
|
|
wiki_url = $2,
|
|
image_path = $3,
|
|
wiki_checked = TRUE
|
|
WHERE id = $4
|
|
`, summary.Extract, wikiURL, localImagePath, tag.ID)
|
|
|
|
if err != nil {
|
|
logger.Printf("Error al actualizar la base de datos para tag %d: %v", tag.ID, err)
|
|
} else {
|
|
logger.Printf("Actualizado con éxito: %s (Imagen: %v)", tag.Valor, localImagePath != nil)
|
|
}
|
|
}
|
|
|
|
func main() {
|
|
if val := os.Getenv("WIKI_SLEEP"); val != "" {
|
|
if sleep, err := fmt.Sscanf(val, "%d", &sleepInterval); err == nil && sleep > 0 {
|
|
sleepInterval = sleep
|
|
}
|
|
}
|
|
|
|
logger.Println("Iniciando Wiki Worker...")
|
|
|
|
if err := os.MkdirAll(imagesDir, 0755); err != nil {
|
|
logger.Fatalf("Error creando directorio de imágenes: %v", err)
|
|
}
|
|
|
|
cfg := workers.LoadDBConfig()
|
|
if err := workers.Connect(cfg); err != nil {
|
|
logger.Fatalf("Failed to connect to database: %v", err)
|
|
}
|
|
pool = workers.GetPool()
|
|
defer workers.Close()
|
|
|
|
ctx := context.Background()
|
|
|
|
sigChan := make(chan os.Signal, 1)
|
|
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
|
|
|
go func() {
|
|
<-sigChan
|
|
logger.Println("Cerrando gracefully...")
|
|
workers.Close()
|
|
os.Exit(0)
|
|
}()
|
|
|
|
logger.Printf("Configuración: sleep=%ds, batch=%d", sleepInterval, batchSize)
|
|
|
|
for {
|
|
tags, err := getPendingTags(ctx)
|
|
if err != nil {
|
|
logger.Printf("Error recuperando tags pendientes: %v", err)
|
|
time.Sleep(10 * time.Second)
|
|
continue
|
|
}
|
|
|
|
if len(tags) == 0 {
|
|
logger.Printf("No hay tags pendientes. Durmiendo %d segundos...", sleepInterval)
|
|
time.Sleep(time.Duration(sleepInterval) * time.Second)
|
|
continue
|
|
}
|
|
|
|
logger.Printf("Recuperados %d tags para procesar...", len(tags))
|
|
|
|
for _, tag := range tags {
|
|
processTag(ctx, tag)
|
|
time.Sleep(3 * time.Second) // Increased delay to avoid Wikipedia Rate Limits (429)
|
|
}
|
|
}
|
|
}
|