go integration and wikipedia
This commit is contained in:
parent 47a252e339
commit ee90335b92
7828 changed files with 1307913 additions and 20807 deletions
468
backend/cmd/discovery/main.go
Normal file
@@ -0,0 +1,468 @@
package main

import (
	"context"
	"errors"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/jackc/pgx/v5"
	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/mmcdole/gofeed"
	"github.com/rss2/backend/internal/workers"
)

var (
	logger    *log.Logger
	dbPool    *pgxpool.Pool
	sleepSec  = 900 // 15 minutes
	batchSize = 10
)

type URLSource struct {
	ID          int64
	Nombre      string
	URL         string
	CategoriaID *int64
	PaisID      *int64
	Idioma      *string
}

func init() {
	logger = log.New(os.Stdout, "[DISCOVERY] ", log.LstdFlags)
}

func loadConfig() {
	sleepSec = getEnvInt("DISCOVERY_INTERVAL", 900)
	batchSize = getEnvInt("DISCOVERY_BATCH", 10)
}

func getEnvInt(key string, defaultValue int) int {
	if value := os.Getenv(key); value != "" {
		if intVal, err := strconv.Atoi(value); err == nil {
			return intVal
		}
	}
	return defaultValue
}

// truncate caps s at n bytes; slicing err.Error()[:200] directly would
// panic whenever the message is shorter than 200 characters.
func truncate(s string, n int) string {
	if len(s) > n {
		return s[:n]
	}
	return s
}

func getPendingURLs(ctx context.Context) ([]URLSource, error) {
	rows, err := dbPool.Query(ctx, `
		SELECT id, nombre, url, categoria_id, pais_id, idioma
		FROM fuentes_url
		WHERE active = TRUE
		ORDER BY
			CASE
				WHEN last_check IS NULL THEN 1
				WHEN last_status = 'error' THEN 2
				WHEN last_status = 'no_feeds' THEN 3
				ELSE 4
			END,
			last_check ASC NULLS FIRST
		LIMIT $1
	`, batchSize)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var sources []URLSource
	for rows.Next() {
		var s URLSource
		if err := rows.Scan(&s.ID, &s.Nombre, &s.URL, &s.CategoriaID, &s.PaisID, &s.Idioma); err != nil {
			continue
		}
		sources = append(sources, s)
	}
	return sources, nil
}

func updateURLStatus(ctx context.Context, urlID int64, status, message string, httpCode int) error {
	_, err := dbPool.Exec(ctx, `
		UPDATE fuentes_url
		SET last_check = NOW(),
		    last_status = $1,
		    status_message = $2,
		    last_http_code = $3
		WHERE id = $4
	`, status, message, httpCode, urlID)
	return err
}

func discoverFeeds(pageURL string) ([]string, error) {
	client := &http.Client{
		Timeout: 15 * time.Second,
	}

	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; RSS2Bot/1.0)")
	req.Header.Set("Accept", "application/rss+xml, application/atom+xml, application/xml, text/xml, text/html")

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Try to parse the response as a feed first
	parser := gofeed.NewParser()
	feed, err := parser.Parse(resp.Body)
	if err == nil && feed != nil && len(feed.Items) > 0 {
		// It's a valid feed
		return []string{pageURL}, nil
	}

	// If not a feed, try to find feed links in the HTML
	return findFeedLinksInHTML(pageURL)
}

func findFeedLinksInHTML(baseURL string) ([]string, error) {
	// Simple feed link finder - returns empty for now.
	// In production, use goquery to parse the HTML and find RSS/Atom links.
	return []string{}, nil
}
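// The stub above always reports "no feeds found". A minimal sketch of the
// goquery-based approach the comment describes might look like the function
// below (hypothetical, not part of this commit; it assumes "net/url" and
// "github.com/PuerkitoBio/goquery", already a dependency of the scraper
// worker, and that pages declare their feeds via <link rel="alternate">):
//
//	func findFeedLinksInHTML(baseURL string) ([]string, error) {
//		resp, err := http.Get(baseURL)
//		if err != nil {
//			return nil, err
//		}
//		defer resp.Body.Close()
//
//		doc, err := goquery.NewDocumentFromReader(resp.Body)
//		if err != nil {
//			return nil, err
//		}
//		base, err := url.Parse(baseURL)
//		if err != nil {
//			return nil, err
//		}
//
//		var feeds []string
//		doc.Find(`link[rel="alternate"]`).Each(func(_ int, s *goquery.Selection) {
//			typ, _ := s.Attr("type")
//			href, ok := s.Attr("href")
//			if !ok || !strings.Contains(typ, "xml") {
//				return // keep only rss+xml / atom+xml / xml types
//			}
//			if ref, err := url.Parse(href); err == nil {
//				feeds = append(feeds, base.ResolveReference(ref).String())
//			}
//		})
//		return feeds, nil
//	}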
func parseFeed(feedURL string) (*gofeed.Feed, error) {
	client := &http.Client{
		Timeout: 30 * time.Second,
	}

	req, err := http.NewRequest("GET", feedURL, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; RSS2Bot/1.0)")
	req.Header.Set("Accept", "application/rss+xml, application/atom+xml, application/xml, text/xml")

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	parser := gofeed.NewParser()
	return parser.Parse(resp.Body)
}

func getFeedMetadata(feedURL string) (title, description, language string, entryCount int, err error) {
	feed, err := parseFeed(feedURL)
	if err != nil {
		return "", "", "", 0, err
	}

	title = feed.Title
	if title == "" {
		title = "Feed sin título"
	}

	description = feed.Description
	if len(description) > 500 {
		description = description[:500]
	}

	language = feed.Language
	entryCount = len(feed.Items)

	return title, description, language, entryCount, nil
}

func analyzeFeed(title, url, description string) (country, category string) {
	// Simple heuristics - in production use ML or an external API
	lowerTitle := strings.ToLower(title)
	lowerDesc := strings.ToLower(description)
	combined := lowerTitle + " " + lowerDesc

	// Detect country
	countries := map[string][]string{
		"España":      {"españa", "español", "madrid", "barcelona"},
		"Argentina":   {"argentino", "buenos aires"},
		"México":      {"méxico", "mexicano", "cdmx", "ciudad de méxico"},
		"Colombia":    {"colombiano", "bogotá"},
		"Chile":       {"chileno", "santiago"},
		"Perú":        {"peruano", "lima"},
		"EE.UU.":      {"estados unidos", "washington", "trump", "biden"},
		"Reino Unido": {"reino unido", "londres", "uk"},
		"Francia":     {"francia", "parís"},
		"Alemania":    {"alemania", "berlín"},
	}

	for country, keywords := range countries {
		for _, kw := range keywords {
			if strings.Contains(combined, kw) {
				return country, ""
			}
		}
	}

	return "", ""
}
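// Example: analyzeFeed("Noticias de Madrid", url, "") lowercases the text,
// matches the keyword "madrid", and returns ("España", ""). The category
// heuristic is not implemented yet, so the second return value is always "".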
func getCountryIDByName(ctx context.Context, countryName string) (*int64, error) {
	var id int64
	err := dbPool.QueryRow(ctx, "SELECT id FROM paises WHERE LOWER(nombre) = LOWER($1)", countryName).Scan(&id)
	if err != nil {
		return nil, err
	}
	return &id, nil
}

func getCategoryIDByName(ctx context.Context, categoryName string) (*int64, error) {
	var id int64
	err := dbPool.QueryRow(ctx, "SELECT id FROM categorias WHERE LOWER(nombre) = LOWER($1)", categoryName).Scan(&id)
	if err != nil {
		return nil, err
	}
	return &id, nil
}

func createPendingFeed(ctx context.Context, fuenteURLID int64, feedURL string, metadata map[string]interface{}) error {
	// Use the comma-ok form: a plain assertion would panic if "title"
	// were missing or not a string.
	feedTitle, _ := metadata["title"].(string)
	if feedTitle == "" {
		feedTitle = "Feed sin título"
	}

	description := ""
	if d, ok := metadata["description"].(string); ok {
		description = d
	}

	language := ""
	if l, ok := metadata["language"].(string); ok {
		language = l
	}

	entryCount := 0
	if c, ok := metadata["entry_count"].(int); ok {
		entryCount = c
	}

	detectedCountry := ""
	if dc, ok := metadata["detected_country"].(string); ok {
		detectedCountry = dc
	}

	var detectedCountryID *int64
	if detectedCountry != "" {
		if cid, err := getCountryIDByName(ctx, detectedCountry); err == nil {
			detectedCountryID = cid
		}
	}

	suggestedCategory := ""
	if sc, ok := metadata["suggested_category"].(string); ok {
		suggestedCategory = sc
	}

	var suggestedCategoryID *int64
	if suggestedCategory != "" {
		if caid, err := getCategoryIDByName(ctx, suggestedCategory); err == nil {
			suggestedCategoryID = caid
		}
	}

	_, err := dbPool.Exec(ctx, `
		INSERT INTO feeds_pending (
			fuente_url_id, feed_url, feed_title, feed_description,
			feed_language, feed_type, entry_count,
			detected_country_id, suggested_categoria_id,
			discovered_at
		)
		VALUES ($1, $2, $3, $4, $5, 'rss', $6, $7, $8, NOW())
		ON CONFLICT (feed_url) DO UPDATE
		SET feed_title = EXCLUDED.feed_title,
		    discovered_at = NOW()
	`, fuenteURLID, feedURL, feedTitle, description, language, entryCount, detectedCountryID, suggestedCategoryID)

	return err
}

func createFeedDirectly(ctx context.Context, feedURL string, fuenteURLID *int64, categoriaID, paisID *int64, idioma *string) (bool, error) {
	title, description, language, _, err := getFeedMetadata(feedURL)
	if err != nil {
		return false, err
	}

	if language == "" && idioma != nil {
		language = *idioma
	}

	var feedID int64
	err = dbPool.QueryRow(ctx, `
		INSERT INTO feeds (nombre, descripcion, url, categoria_id, pais_id, idioma, fuente_url_id, activo)
		VALUES ($1, $2, $3, $4, $5, $6, $7, TRUE)
		ON CONFLICT (url) DO NOTHING
		RETURNING id
	`, title, description, feedURL, categoriaID, paisID, language, fuenteURLID).Scan(&feedID)

	if err != nil {
		// ON CONFLICT DO NOTHING returns no row when the feed already
		// exists, which surfaces here as pgx.ErrNoRows rather than a
		// real failure.
		if errors.Is(err, pgx.ErrNoRows) {
			return false, nil
		}
		return false, err
	}

	return feedID > 0, nil
}

func processURLSource(ctx context.Context, source URLSource) {
	logger.Printf("Processing: %s (%s)", source.Nombre, source.URL)

	// Try to find feeds on this URL
	feeds, err := discoverFeeds(source.URL)
	if err != nil {
		logger.Printf("Error discovering feeds: %v", err)
		updateURLStatus(ctx, source.ID, "error", truncate(err.Error(), 200), 0)
		return
	}

	if len(feeds) == 0 {
		logger.Printf("No feeds found for: %s", source.URL)
		updateURLStatus(ctx, source.ID, "no_feeds", "No feeds found", 200)
		return
	}

	logger.Printf("Found %d feeds for %s", len(feeds), source.URL)

	maxFeeds := getEnvInt("MAX_FEEDS_PER_URL", 5)
	if len(feeds) > maxFeeds {
		feeds = feeds[:maxFeeds]
	}

	autoApprove := source.CategoriaID != nil && source.PaisID != nil

	created := 0
	pending := 0
	existing := 0
	errCount := 0

	for _, feedURL := range feeds {
		// Get feed metadata
		title, description, language, entryCount, err := getFeedMetadata(feedURL)
		if err != nil {
			logger.Printf("Error parsing feed %s: %v", feedURL, err)
			errCount++
			continue
		}

		// Analyze for country/category
		detectedCountry, suggestedCategory := analyzeFeed(title, feedURL, description)

		metadata := map[string]interface{}{
			"title":              title,
			"description":        description,
			"language":           language,
			"entry_count":        entryCount,
			"detected_country":   detectedCountry,
			"suggested_category": suggestedCategory,
		}

		if !autoApprove {
			// Create a pending feed for review
			if err := createPendingFeed(ctx, source.ID, feedURL, metadata); err != nil {
				logger.Printf("Error creating pending feed: %v", err)
				errCount++
			} else {
				pending++
			}
		} else {
			// Create the feed directly
			createdFeed, err := createFeedDirectly(ctx, feedURL, &source.ID, source.CategoriaID, source.PaisID, source.Idioma)
			if err != nil {
				logger.Printf("Error creating feed: %v", err)
				errCount++
			} else if createdFeed {
				created++
			} else {
				existing++
			}
		}

		time.Sleep(1 * time.Second) // Rate limiting
	}

	// Update status
	var status string
	var message string
	if created > 0 || pending > 0 {
		status = "success"
		parts := []string{}
		if created > 0 {
			parts = append(parts, fmt.Sprintf("%d created", created))
		}
		if pending > 0 {
			parts = append(parts, fmt.Sprintf("%d pending", pending))
		}
		message = strings.Join(parts, ", ")
	} else if existing > 0 {
		status = "existing"
		message = fmt.Sprintf("%d already existed", existing)
	} else {
		status = "error"
		message = fmt.Sprintf("%d errors", errCount)
	}

	updateURLStatus(ctx, source.ID, status, message, 200)
	logger.Printf("Processed %s: created=%d, pending=%d, existing=%d, errors=%d",
		source.URL, created, pending, existing, errCount)
}

func main() {
	loadConfig()
	logger.Println("Starting RSS Discovery Worker")

	cfg := workers.LoadDBConfig()
	if err := workers.Connect(cfg); err != nil {
		logger.Fatalf("Failed to connect to database: %v", err)
	}
	dbPool = workers.GetPool()
	defer workers.Close()

	logger.Println("Connected to PostgreSQL")

	ctx := context.Background()

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	go func() {
		<-sigChan
		logger.Println("Shutting down...")
		os.Exit(0)
	}()

	logger.Printf("Config: interval=%ds, batch=%d", sleepSec, batchSize)

	ticker := time.NewTicker(time.Duration(sleepSec) * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			sources, err := getPendingURLs(ctx)
			if err != nil {
				logger.Printf("Error fetching URLs: %v", err)
				continue
			}

			if len(sources) == 0 {
				logger.Println("No pending URLs to process")
				continue
			}

			logger.Printf("Processing %d sources", len(sources))

			for _, source := range sources {
				processURLSource(ctx, source)
				time.Sleep(2 * time.Second)
			}
		}
	}
}
391
backend/cmd/qdrant/main.go
Normal file
@@ -0,0 +1,391 @@
package main

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"syscall"
	"time"

	"github.com/google/uuid"
	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/rss2/backend/internal/workers"
)

var (
	logger     *log.Logger
	dbPool     *pgxpool.Pool
	qdrantURL  string
	ollamaURL  string
	collection = "news_vectors"
	sleepSec   = 30
	batchSize  = 100
)

func init() {
	logger = log.New(os.Stdout, "[QDRANT] ", log.LstdFlags)
}

func loadConfig() {
	sleepSec = getEnvInt("QDRANT_SLEEP", 30)
	batchSize = getEnvInt("QDRANT_BATCH", 100)
	qdrantHost := getEnv("QDRANT_HOST", "localhost")
	qdrantPort := getEnvInt("QDRANT_PORT", 6333)
	qdrantURL = fmt.Sprintf("http://%s:%d", qdrantHost, qdrantPort)
	ollamaURL = getEnv("OLLAMA_URL", "http://ollama:11434")
	collection = getEnv("QDRANT_COLLECTION", "news_vectors")
}

func getEnv(key, defaultValue string) string {
	if value := os.Getenv(key); value != "" {
		return value
	}
	return defaultValue
}

func getEnvInt(key string, defaultValue int) int {
	if value := os.Getenv(key); value != "" {
		if intVal, err := strconv.Atoi(value); err == nil {
			return intVal
		}
	}
	return defaultValue
}

type Translation struct {
	ID           int64
	NoticiaID    int64
	Lang         string
	Titulo       string
	Resumen      string
	URL          string
	Fecha        *time.Time
	FuenteNombre string
	CategoriaID  *int64
	PaisID       *int64
}

func getPendingTranslations(ctx context.Context) ([]Translation, error) {
	rows, err := dbPool.Query(ctx, `
		SELECT
			t.id AS traduccion_id,
			t.noticia_id,
			TRIM(t.lang_to) AS lang,
			t.titulo_trad AS titulo,
			t.resumen_trad AS resumen,
			n.url,
			n.fecha,
			n.fuente_nombre,
			n.categoria_id,
			n.pais_id
		FROM traducciones t
		INNER JOIN noticias n ON t.noticia_id = n.id
		WHERE t.vectorized = FALSE
		  AND t.status = 'done'
		ORDER BY t.created_at ASC
		LIMIT $1
	`, batchSize)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var translations []Translation
	for rows.Next() {
		var t Translation
		if err := rows.Scan(
			&t.ID, &t.NoticiaID, &t.Lang, &t.Titulo, &t.Resumen,
			&t.URL, &t.Fecha, &t.FuenteNombre, &t.CategoriaID, &t.PaisID,
		); err != nil {
			continue
		}
		translations = append(translations, t)
	}
	return translations, nil
}

type EmbeddingRequest struct {
	Model string `json:"model"`
	// Ollama's legacy /api/embeddings endpoint expects the text under
	// "prompt" (the newer /api/embed endpoint is the one that uses
	// "input"); sending "input" here would yield empty embeddings.
	Prompt string `json:"prompt"`
}

type EmbeddingResponse struct {
	Embedding []float64 `json:"embedding"`
}

func generateEmbedding(text string) ([]float64, error) {
	reqBody := EmbeddingRequest{
		Model:  "mxbai-embed-large",
		Prompt: text,
	}

	body, err := json.Marshal(reqBody)
	if err != nil {
		return nil, err
	}

	client := &http.Client{Timeout: 60 * time.Second}
	resp, err := client.Post(ollamaURL+"/api/embeddings", "application/json", bytes.NewReader(body))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("ollama returned status %d", resp.StatusCode)
	}

	var result EmbeddingResponse
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, err
	}

	return result.Embedding, nil
}
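// For reference, the exchange with the legacy endpoint has this shape
// (per Ollama's documented embeddings API; values are illustrative):
//
//	POST {OLLAMA_URL}/api/embeddings
//	{"model": "mxbai-embed-large", "prompt": "titulo resumen"}
//
//	200 OK
//	{"embedding": [0.12, -0.03, ...]}
//
// mxbai-embed-large produces 1024-dimensional vectors, which is the size
// ensureCollection measures below when creating the Qdrant collection.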
type QdrantPoint struct {
	ID      interface{}            `json:"id"`
	Vector  []float64              `json:"vector"`
	Payload map[string]interface{} `json:"payload"`
}

type QdrantUpsertRequest struct {
	Points []QdrantPoint `json:"points"`
}

func ensureCollection() error {
	req, err := http.NewRequest("GET", qdrantURL+"/collections/"+collection, nil)
	if err != nil {
		return err
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		logger.Printf("Collection %s already exists", collection)
		return nil
	}

	// Get the embedding dimension by embedding a probe string
	emb, err := generateEmbedding("test")
	if err != nil {
		return fmt.Errorf("failed to get embedding dimension: %w", err)
	}
	dimension := len(emb)

	// Create the collection. Qdrant's REST API creates collections with
	// PUT /collections/{name}; the vector parameters go in the body (a
	// top-level "name" field is not part of that request).
	createReq := map[string]interface{}{
		"vectors": map[string]interface{}{
			"size":     dimension,
			"distance": "Cosine",
		},
	}

	body, err := json.Marshal(createReq)
	if err != nil {
		return err
	}
	putReq, err := http.NewRequest("PUT", qdrantURL+"/collections/"+collection, bytes.NewReader(body))
	if err != nil {
		return err
	}
	putReq.Header.Set("Content-Type", "application/json")

	resp2, err := http.DefaultClient.Do(putReq)
	if err != nil {
		return err
	}
	defer resp2.Body.Close()

	if resp2.StatusCode != http.StatusOK {
		respBody, _ := io.ReadAll(resp2.Body)
		return fmt.Errorf("failed to create collection: status %d: %s", resp2.StatusCode, string(respBody))
	}

	logger.Printf("Created collection %s with dimension %d", collection, dimension)
	return nil
}

func uploadToQdrant(translations []Translation, embeddings [][]float64, pointIDs []string) error {
	points := make([]QdrantPoint, 0, len(translations))

	for i, t := range translations {
		if embeddings[i] == nil || pointIDs[i] == "" {
			continue
		}

		payload := map[string]interface{}{
			"news_id":       t.NoticiaID,
			"traduccion_id": t.ID,
			"titulo":        t.Titulo,
			"resumen":       t.Resumen,
			"url":           t.URL,
			"fuente_nombre": t.FuenteNombre,
			"lang":          t.Lang,
		}

		if t.Fecha != nil {
			payload["fecha"] = t.Fecha.Format(time.RFC3339)
		}
		if t.CategoriaID != nil {
			payload["categoria_id"] = *t.CategoriaID
		}
		if t.PaisID != nil {
			payload["pais_id"] = *t.PaisID
		}

		// Use the caller-supplied point ID so the same UUID can be
		// stored back in traducciones.qdrant_point_id; generating a
		// fresh UUID here would leave the database pointing at IDs
		// that never existed in Qdrant.
		points = append(points, QdrantPoint{
			ID:      pointIDs[i],
			Vector:  embeddings[i],
			Payload: payload,
		})
	}

	if len(points) == 0 {
		return nil
	}

	reqBody := QdrantUpsertRequest{Points: points}
	body, err := json.Marshal(reqBody)
	if err != nil {
		return err
	}

	// Qdrant upserts points with PUT on /collections/{name}/points
	url := fmt.Sprintf("%s/collections/%s/points", qdrantURL, collection)
	putReq, err := http.NewRequest("PUT", url, bytes.NewReader(body))
	if err != nil {
		return err
	}
	putReq.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(putReq)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 && resp.StatusCode != 202 {
		respBody, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("qdrant returned status %d: %s", resp.StatusCode, string(respBody))
	}

	return nil
}
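// The upsert request sent to {qdrant}/collections/{collection}/points has
// this shape (hypothetical values; payload keys as built above):
//
//	{"points": [{
//	  "id": "3f2b6c0e-...-uuid",
//	  "vector": [0.12, -0.03, ...],
//	  "payload": {"news_id": 42, "traduccion_id": 7, "lang": "es", ...}
//	}]}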
func updateTranslationStatus(ctx context.Context, translations []Translation, pointIDs []string) error {
	for i, t := range translations {
		if i >= len(pointIDs) || pointIDs[i] == "" {
			continue
		}

		_, err := dbPool.Exec(ctx, `
			UPDATE traducciones
			SET
				vectorized = TRUE,
				vectorization_date = NOW(),
				qdrant_point_id = $1
			WHERE id = $2
		`, pointIDs[i], t.ID)

		if err != nil {
			logger.Printf("Error updating translation %d: %v", t.ID, err)
		}
	}

	return nil
}

func getStats(ctx context.Context) (total, vectorized, pending int, err error) {
	err = dbPool.QueryRow(ctx, `
		SELECT
			COUNT(*) AS total,
			COUNT(*) FILTER (WHERE vectorized = TRUE) AS vectorized,
			COUNT(*) FILTER (WHERE vectorized = FALSE AND status = 'done') AS pending
		FROM traducciones
		WHERE lang_to = 'es'
	`).Scan(&total, &vectorized, &pending)

	return total, vectorized, pending, err
}

func main() {
	loadConfig()
	logger.Println("Starting Qdrant Vectorization Worker")

	cfg := workers.LoadDBConfig()
	if err := workers.Connect(cfg); err != nil {
		logger.Fatalf("Failed to connect to database: %v", err)
	}
	dbPool = workers.GetPool()
	defer workers.Close()

	logger.Println("Connected to PostgreSQL")

	ctx := context.Background()

	if err := ensureCollection(); err != nil {
		logger.Printf("Warning: Could not ensure collection: %v", err)
	}

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	go func() {
		<-sigChan
		logger.Println("Shutting down...")
		os.Exit(0)
	}()

	logger.Printf("Config: qdrant=%s, ollama=%s, collection=%s, sleep=%ds, batch=%d",
		qdrantURL, ollamaURL, collection, sleepSec, batchSize)

	totalProcessed := 0

	for {
		select {
		case <-time.After(time.Duration(sleepSec) * time.Second):
			translations, err := getPendingTranslations(ctx)
			if err != nil {
				logger.Printf("Error fetching pending translations: %v", err)
				continue
			}

			if len(translations) == 0 {
				logger.Println("No pending translations to process")
				continue
			}

			logger.Printf("Processing %d translations...", len(translations))

			// Generate embeddings, assigning each successful one the
			// point ID that is shared between Qdrant and the database.
			// Failed embeddings keep a nil vector / empty ID and are
			// skipped by both uploadToQdrant and updateTranslationStatus,
			// so they are retried on a later cycle.
			embeddings := make([][]float64, len(translations))
			pointIDs := make([]string, len(translations))
			for i, t := range translations {
				text := fmt.Sprintf("%s %s", t.Titulo, t.Resumen)
				emb, err := generateEmbedding(text)
				if err != nil {
					logger.Printf("Error generating embedding for %d: %v", t.ID, err)
					continue
				}
				embeddings[i] = emb
				pointIDs[i] = uuid.New().String()
			}

			// Upload to Qdrant
			if err := uploadToQdrant(translations, embeddings, pointIDs); err != nil {
				logger.Printf("Error uploading to Qdrant: %v", err)
				continue
			}

			// Update DB status with the same point IDs that were uploaded
			if err := updateTranslationStatus(ctx, translations, pointIDs); err != nil {
				logger.Printf("Error updating status: %v", err)
			}

			totalProcessed += len(translations)
			logger.Printf("Processed %d translations (total: %d)", len(translations), totalProcessed)

			total, vectorized, pending, err := getStats(ctx)
			if err == nil {
				logger.Printf("Stats: total=%d, vectorized=%d, pending=%d", total, vectorized, pending)
			}
		}
	}
}
384
backend/cmd/related/main.go
Normal file
@@ -0,0 +1,384 @@
package main

import (
	"context"
	"log"
	"math"
	"os"
	"os/signal"
	"sort"
	"strconv"
	"syscall"
	"time"

	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/rss2/backend/internal/workers"
)

var (
	logger   *log.Logger
	dbPool   *pgxpool.Pool
	sleepSec = 10
	topK     = 10
	batchSz  = 200
	minScore = 0.0
)

func init() {
	logger = log.New(os.Stdout, "[RELATED] ", log.LstdFlags)
}

func loadConfig() {
	sleepSec = getEnvInt("RELATED_SLEEP", 10)
	topK = getEnvInt("RELATED_TOPK", 10)
	batchSz = getEnvInt("RELATED_BATCH", 200)
	minScore = getEnvFloat("RELATED_MIN_SCORE", 0.0)
}

func getEnvInt(key string, defaultValue int) int {
	if value := os.Getenv(key); value != "" {
		if intVal, err := strconv.Atoi(value); err == nil {
			return intVal
		}
	}
	return defaultValue
}

func getEnvFloat(key string, defaultValue float64) float64 {
	if value := os.Getenv(key); value != "" {
		if floatVal, err := strconv.ParseFloat(value, 64); err == nil {
			return floatVal
		}
	}
	return defaultValue
}

type Translation struct {
	ID        int64
	Titulo    string
	Resumen   string
	Embedding []float64
}

func ensureSchema(ctx context.Context) error {
	_, err := dbPool.Exec(ctx, `
		CREATE TABLE IF NOT EXISTS related_noticias (
			traduccion_id INTEGER REFERENCES traducciones(id) ON DELETE CASCADE,
			related_traduccion_id INTEGER REFERENCES traducciones(id) ON DELETE CASCADE,
			score FLOAT NOT NULL DEFAULT 0,
			created_at TIMESTAMP DEFAULT NOW(),
			PRIMARY KEY (traduccion_id, related_traduccion_id)
		);
	`)
	if err != nil {
		return err
	}

	// Ensure the traduccion_embeddings table exists
	_, err = dbPool.Exec(ctx, `
		CREATE TABLE IF NOT EXISTS traduccion_embeddings (
			id SERIAL PRIMARY KEY,
			traduccion_id INTEGER NOT NULL REFERENCES traducciones(id) ON DELETE CASCADE,
			model TEXT NOT NULL,
			dim INTEGER NOT NULL,
			embedding DOUBLE PRECISION[] NOT NULL,
			created_at TIMESTAMP DEFAULT NOW(),
			UNIQUE (traduccion_id, model)
		);
	`)
	if err != nil {
		return err
	}

	_, err = dbPool.Exec(ctx, `
		CREATE INDEX IF NOT EXISTS idx_tr_emb_model ON traduccion_embeddings(model);
	`)
	if err != nil {
		return err
	}

	_, err = dbPool.Exec(ctx, `
		CREATE INDEX IF NOT EXISTS idx_tr_emb_traduccion_id ON traduccion_embeddings(traduccion_id);
	`)

	return err
}

func fetchAllEmbeddings(ctx context.Context, model string) ([]Translation, error) {
	rows, err := dbPool.Query(ctx, `
		SELECT e.traduccion_id,
		       COALESCE(NULLIF(t.titulo_trad, ''), ''),
		       COALESCE(NULLIF(t.resumen_trad, ''), ''),
		       e.embedding
		FROM traduccion_embeddings e
		JOIN traducciones t ON t.id = e.traduccion_id
		WHERE e.model = $1
		  AND t.status = 'done'
		  AND t.lang_to = 'es'
	`, model)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var translations []Translation
	for rows.Next() {
		var t Translation
		if err := rows.Scan(&t.ID, &t.Titulo, &t.Resumen, &t.Embedding); err != nil {
			continue
		}
		translations = append(translations, t)
	}
	return translations, nil
}

func fetchPendingIDs(ctx context.Context, model string, limit int) ([]int64, error) {
	rows, err := dbPool.Query(ctx, `
		SELECT t.id
		FROM traducciones t
		JOIN traduccion_embeddings e ON e.traduccion_id = t.id AND e.model = $1
		LEFT JOIN related_noticias r ON r.traduccion_id = t.id
		WHERE t.lang_to = 'es'
		  AND t.status = 'done'
		GROUP BY t.id
		HAVING COUNT(r.related_traduccion_id) = 0
		ORDER BY t.id DESC
		LIMIT $2
	`, model, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var ids []int64
	for rows.Next() {
		var id int64
		if err := rows.Scan(&id); err != nil {
			continue
		}
		ids = append(ids, id)
	}
	return ids, nil
}

func cosineSimilarity(a, b []float64) float64 {
	if len(a) != len(b) || len(a) == 0 {
		return 0
	}

	var dotProduct, normA, normB float64
	for i := range a {
		dotProduct += a[i] * b[i]
		normA += a[i] * a[i]
		normB += b[i] * b[i]
	}

	// math.Sqrt replaces the hand-rolled Newton iteration that was here
	normA = math.Sqrt(normA)
	normB = math.Sqrt(normB)

	if normA == 0 || normB == 0 {
		return 0
	}

	return dotProduct / (normA * normB)
}
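// Quick sanity check (hypothetical vectors, not production data):
//
//	cosineSimilarity([]float64{1, 0}, []float64{1, 0}) // 1.0  (identical)
//	cosineSimilarity([]float64{1, 0}, []float64{0, 1}) // 0.0  (orthogonal)
//	cosineSimilarity([]float64{1, 1}, []float64{1, 0}) // ≈0.7071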
func findTopK(query Embedding, candidates []Translation, k int, minScore float64) []struct {
	ID    int64
	Score float64
} {
	type sim struct {
		id    int64
		score float64
	}

	var similarities []sim

	for _, c := range candidates {
		if c.ID == query.ID {
			continue
		}

		score := cosineSimilarity(query.Embedding, c.Embedding)
		if score <= minScore {
			continue
		}

		similarities = append(similarities, sim{c.ID, score})
	}

	// Sort by score descending (sort.Slice replaces the original O(n²)
	// exchange sort; the resulting order is the same)
	sort.Slice(similarities, func(i, j int) bool {
		return similarities[i].score > similarities[j].score
	})

	if len(similarities) > k {
		similarities = similarities[:k]
	}

	result := make([]struct {
		ID    int64
		Score float64
	}, len(similarities))
	for i, s := range similarities {
		result[i] = struct {
			ID    int64
			Score float64
		}{s.id, s.score}
	}

	return result
}
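// Example (hypothetical scores): with k = 2 and minScore = 0, candidates
// scoring 0.91, 0.30, and 0.85 against the query come back as the two
// entries scoring 0.91 and 0.85, in descending order.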
type Embedding struct {
	ID        int64
	Embedding []float64
}

func findEmbeddingByID(embeddings []Embedding, id int64) *Embedding {
	for i := range embeddings {
		if embeddings[i].ID == id {
			return &embeddings[i]
		}
	}
	return nil
}

func insertRelated(ctx context.Context, traduccionID int64, related []struct {
	ID    int64
	Score float64
}) error {
	if len(related) == 0 {
		return nil
	}

	for _, r := range related {
		if r.Score <= 0 {
			continue
		}
		_, err := dbPool.Exec(ctx, `
			INSERT INTO related_noticias (traduccion_id, related_traduccion_id, score)
			VALUES ($1, $2, $3)
			ON CONFLICT (traduccion_id, related_traduccion_id)
			DO UPDATE SET score = EXCLUDED.score
		`, traduccionID, r.ID, r.Score)
		if err != nil {
			logger.Printf("Error inserting related: %v", err)
		}
	}
	return nil
}

func processBatch(ctx context.Context, model string) (int, error) {
	// Fetch all embeddings once per cycle; every pending translation is
	// compared against this full in-memory set
	allTranslations, err := fetchAllEmbeddings(ctx, model)
	if err != nil {
		return 0, err
	}

	if len(allTranslations) == 0 {
		return 0, nil
	}

	// Convert to Embedding format for easier lookup
	var allEmbeddings []Embedding
	for _, t := range allTranslations {
		if t.Embedding != nil {
			allEmbeddings = append(allEmbeddings, Embedding{ID: t.ID, Embedding: t.Embedding})
		}
	}

	// Get pending IDs
	pendingIDs, err := fetchPendingIDs(ctx, model, batchSz)
	if err != nil {
		return 0, err
	}

	if len(pendingIDs) == 0 {
		return 0, nil
	}

	processed := 0

	for _, tradID := range pendingIDs {
		emb := findEmbeddingByID(allEmbeddings, tradID)
		if emb == nil {
			continue
		}

		topRelated := findTopK(*emb, allTranslations, topK, minScore)

		if err := insertRelated(ctx, tradID, topRelated); err != nil {
			logger.Printf("Error inserting related for %d: %v", tradID, err)
			continue
		}

		processed++
	}

	return processed, nil
}

func main() {
	loadConfig()
	logger.Println("Starting Related News Worker")

	cfg := workers.LoadDBConfig()
	if err := workers.Connect(cfg); err != nil {
		logger.Fatalf("Failed to connect to database: %v", err)
	}
	dbPool = workers.GetPool()
	defer workers.Close()

	ctx := context.Background()

	// Ensure schema
	if err := ensureSchema(ctx); err != nil {
		logger.Printf("Error ensuring schema: %v", err)
	}

	model := os.Getenv("EMB_MODEL")
	if model == "" {
		model = "mxbai-embed-large"
	}

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	go func() {
		<-sigChan
		logger.Println("Shutting down...")
		os.Exit(0)
	}()

	logger.Printf("Config: sleep=%ds, topK=%d, batch=%d, model=%s", sleepSec, topK, batchSz, model)

	for {
		select {
		case <-time.After(time.Duration(sleepSec) * time.Second):
			count, err := processBatch(ctx, model)
			if err != nil {
				logger.Printf("Error processing batch: %v", err)
				continue
			}

			if count > 0 {
				logger.Printf("Generated related news for %d translations", count)
			}
		}
	}
}
330
backend/cmd/scraper/main.go
Normal file
@@ -0,0 +1,330 @@
package main

import (
	"context"
	"crypto/md5"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/rss2/backend/internal/workers"
)

var (
	logger        *log.Logger
	pool          *pgxpool.Pool
	sleepInterval = 60
	batchSize     = 10
)

type URLSource struct {
	ID          int64
	Nombre      string
	URL         string
	CategoriaID *int64
	PaisID      *int64
	Idioma      *string
	Active      bool
}

type Article struct {
	Title    string
	Summary  string
	Content  string
	URL      string
	ImageURL string
	PubDate  *time.Time
}

func init() {
	logger = log.New(os.Stdout, "[SCRAPER] ", log.LstdFlags)
}

func loadConfig() {
	sleepInterval = getEnvInt("SCRAPER_SLEEP", 60)
	batchSize = getEnvInt("SCRAPER_BATCH", 10)
}

func getEnvInt(key string, defaultValue int) int {
	if value := os.Getenv(key); value != "" {
		if intVal, err := strconv.Atoi(value); err == nil {
			return intVal
		}
	}
	return defaultValue
}

// truncate caps s at n bytes; slicing err.Error()[:200] directly would
// panic on messages shorter than 200 characters.
func truncate(s string, n int) string {
	if len(s) > n {
		return s[:n]
	}
	return s
}

func getActiveURLs(ctx context.Context) ([]URLSource, error) {
	rows, err := pool.Query(ctx, `
		SELECT id, nombre, url, categoria_id, pais_id, idioma, activo
		FROM fuentes_url
		WHERE activo = true
	`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var sources []URLSource
	for rows.Next() {
		var s URLSource
		err := rows.Scan(&s.ID, &s.Nombre, &s.URL, &s.CategoriaID, &s.PaisID, &s.Idioma, &s.Active)
		if err != nil {
			continue
		}
		sources = append(sources, s)
	}
	return sources, nil
}

func updateSourceStatus(ctx context.Context, sourceID int64, status, message string, httpCode int) error {
	_, err := pool.Exec(ctx, `
		UPDATE fuentes_url
		SET last_check = NOW(),
		    last_status = $1,
		    status_message = $2,
		    last_http_code = $3
		WHERE id = $4
	`, status, message, httpCode, sourceID)
	return err
}

func extractArticle(source URLSource) (*Article, error) {
	client := &http.Client{
		Timeout: 30 * time.Second,
	}

	req, err := http.NewRequest("GET", source.URL, nil)
	if err != nil {
		return nil, err
	}

	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept-Language", "en-US,en;q=0.5")

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}

	article := &Article{
		URL: source.URL,
	}

	// Extract title: prefer og:title, then the title meta tag, then the
	// first <h1>, and finally the document <title>
	article.Title = doc.Find("meta[property='og:title']").First().AttrOr("content", "")
	if article.Title == "" {
		article.Title = doc.Find("meta[name='title']").First().AttrOr("content", "")
	}
	if article.Title == "" {
		article.Title = doc.Find("h1").First().Text()
	}
	if article.Title == "" {
		article.Title = doc.Find("title").First().Text()
	}

	// Extract description/summary
	article.Summary = doc.Find("meta[property='og:description']").First().AttrOr("content", "")
	if article.Summary == "" {
		article.Summary = doc.Find("meta[name='description']").First().AttrOr("content", "")
	}

	// Extract image
	article.ImageURL = doc.Find("meta[property='og:image']").First().AttrOr("content", "")

	// Extract main content - try common selectors
	contentSelectors := []string{
		"article",
		"[role='main']",
		"main",
		".article-content",
		".post-content",
		".entry-content",
		".content",
		"#content",
	}

	for _, sel := range contentSelectors {
		content := doc.Find(sel).First()
		if content.Length() > 0 {
			article.Content = content.Text()
			break
		}
	}

	// Clean up
	article.Title = strings.TrimSpace(article.Title)
	article.Summary = strings.TrimSpace(article.Summary)
	article.Content = strings.TrimSpace(article.Content)

	// Truncate the summary if it is too long
	if len(article.Summary) > 500 {
		article.Summary = article.Summary[:500]
	}

	return article, nil
}
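// Example: for a page whose <head> contains
//
//	<meta property="og:title" content="Some headline">
//
// extractArticle fills Title with "Some headline" and never falls through
// to the <h1>/<title> selectors (hypothetical markup for illustration).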
func saveArticle(ctx context.Context, source URLSource, article *Article) (bool, error) {
	finalURL := article.URL
	if finalURL == "" {
		finalURL = source.URL
	}

	// Generate a deterministic ID from the URL: the hex MD5 digest is a
	// stable 32-character key, so re-scraping the same URL maps to the
	// same row
	articleID := fmt.Sprintf("%x", md5.Sum([]byte(finalURL)))

	// Check if it already exists
	var exists bool
	err := pool.QueryRow(ctx, "SELECT EXISTS(SELECT 1 FROM noticias WHERE id = $1)", articleID).Scan(&exists)
	if err != nil {
		return false, err
	}
	if exists {
		return false, nil
	}

	title := article.Title
	if title == "" {
		title = "Sin título"
	}

	summary := article.Summary
	if summary == "" && article.Content != "" {
		summary = article.Content
		if len(summary) > 500 {
			summary = summary[:500]
		}
	}

	pubDate := time.Now()
	if article.PubDate != nil {
		pubDate = *article.PubDate
	}

	_, err = pool.Exec(ctx, `
		INSERT INTO noticias (
			id, titulo, resumen, url, fecha, imagen_url,
			fuente_nombre, categoria_id, pais_id
		) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
		ON CONFLICT (id) DO NOTHING
	`, articleID, title, summary, finalURL, pubDate, article.ImageURL,
		source.Nombre, source.CategoriaID, source.PaisID)

	if err != nil {
		return false, err
	}

	return true, nil
}

func processSource(ctx context.Context, source URLSource) {
	logger.Printf("Processing: %s (%s)", source.Nombre, source.URL)

	article, err := extractArticle(source)
	if err != nil {
		logger.Printf("Error extracting article from %s: %v", source.URL, err)
		status := "ERROR"
		if strings.Contains(err.Error(), "HTTP") {
			status = "ERROR_HTTP"
		}
		updateSourceStatus(ctx, source.ID, status, truncate(err.Error(), 200), 0)
		return
	}

	if article.Title == "" {
		logger.Printf("No title found for %s", source.URL)
		updateSourceStatus(ctx, source.ID, "ERROR_PARSE", "No title extracted", 200)
		return
	}

	saved, err := saveArticle(ctx, source, article)
	if err != nil {
		logger.Printf("Error saving article: %v", err)
		updateSourceStatus(ctx, source.ID, "ERROR_DB", truncate(err.Error(), 200), 0)
		return
	}

	if saved {
		logger.Printf("Saved: %s", article.Title)
		updateSourceStatus(ctx, source.ID, "OK", "News created successfully", 200)
	} else {
		logger.Printf("Already exists: %s", article.Title)
		updateSourceStatus(ctx, source.ID, "OK", "News already exists", 200)
	}
}

func main() {
	loadConfig()
	logger.Println("Starting Scraper Worker")

	cfg := workers.LoadDBConfig()
	if err := workers.Connect(cfg); err != nil {
		logger.Fatalf("Failed to connect to database: %v", err)
	}
	pool = workers.GetPool()
	defer workers.Close()

	logger.Println("Connected to PostgreSQL")

	ctx := context.Background()

	// Handle shutdown
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	go func() {
		<-sigChan
		logger.Println("Shutting down...")
		os.Exit(0)
	}()

	logger.Printf("Config: sleep=%ds, batch=%d", sleepInterval, batchSize)

	ticker := time.NewTicker(time.Duration(sleepInterval) * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			sources, err := getActiveURLs(ctx)
			if err != nil {
				logger.Printf("Error fetching URLs: %v", err)
				continue
			}

			if len(sources) == 0 {
				logger.Println("No active URLs to process")
				continue
			}

			logger.Printf("Processing %d sources", len(sources))

			for _, source := range sources {
				processSource(ctx, source)
				time.Sleep(2 * time.Second) // Rate limiting
			}
		}
	}
}
190
backend/cmd/server/main.go
Normal file
@@ -0,0 +1,190 @@
package main

import (
	"context"
	"fmt"
	"log"
	"os"
	"os/signal"
	"syscall"

	"github.com/gin-gonic/gin"
	"github.com/rss2/backend/internal/cache"
	"github.com/rss2/backend/internal/config"
	"github.com/rss2/backend/internal/db"
	"github.com/rss2/backend/internal/handlers"
	"github.com/rss2/backend/internal/middleware"
	"github.com/rss2/backend/internal/services"
)

func initDB() {
	ctx := context.Background()

	// Create the entity_aliases table if it doesn't exist
	_, err := db.GetPool().Exec(ctx, `
		CREATE TABLE IF NOT EXISTS entity_aliases (
			id SERIAL PRIMARY KEY,
			canonical_name VARCHAR(255) NOT NULL,
			alias VARCHAR(255) NOT NULL,
			tipo VARCHAR(50) NOT NULL CHECK (tipo IN ('persona', 'organizacion', 'lugar', 'tema')),
			created_at TIMESTAMP DEFAULT NOW(),
			UNIQUE(alias, tipo)
		)
	`)
	if err != nil {
		log.Printf("Warning: Could not create entity_aliases table: %v", err)
	} else {
		log.Println("Table entity_aliases ready")
	}

	// Add the role column to users if it doesn't exist
	_, err = db.GetPool().Exec(ctx, `
		ALTER TABLE users ADD COLUMN IF NOT EXISTS role VARCHAR(20) DEFAULT 'user'
	`)
	if err != nil {
		log.Printf("Warning: Could not add role column: %v", err)
	} else {
		log.Println("Column role ready")
	}

	// Create the config table if it doesn't exist
	_, err = db.GetPool().Exec(ctx, `
		CREATE TABLE IF NOT EXISTS config (
			key VARCHAR(100) PRIMARY KEY,
			value TEXT,
			updated_at TIMESTAMP DEFAULT NOW()
		)
	`)
	if err != nil {
		log.Printf("Warning: Could not create config table: %v", err)
	} else {
		log.Println("Table config ready")
	}

	// Insert default configuration if not present; ON CONFLICT DO NOTHING
	// makes these inserts idempotent
	db.GetPool().Exec(ctx, `
		INSERT INTO config (key, value) VALUES ('translator_type', 'cpu')
		ON CONFLICT (key) DO NOTHING
	`)
	db.GetPool().Exec(ctx, `
		INSERT INTO config (key, value) VALUES ('translator_workers', '2')
		ON CONFLICT (key) DO NOTHING
	`)
	db.GetPool().Exec(ctx, `
		INSERT INTO config (key, value) VALUES ('translator_status', 'stopped')
		ON CONFLICT (key) DO NOTHING
	`)
}

func main() {
	cfg := config.Load()

	if err := db.Connect(cfg.DatabaseURL); err != nil {
		log.Fatalf("Failed to connect to database: %v", err)
	}
	defer db.Close()
	log.Println("Connected to PostgreSQL")

	// Auto-setup DB tables
	initDB()

	if err := cache.Connect(cfg.RedisURL); err != nil {
		log.Printf("Warning: Failed to connect to Redis: %v", err)
	} else {
		defer cache.Close()
		log.Println("Connected to Redis")
	}

	services.Init(cfg)

	r := gin.Default()

	r.Use(middleware.CORSMiddleware())
	r.Use(middleware.LoggerMiddleware())

	r.GET("/health", func(c *gin.Context) {
		c.JSON(200, gin.H{"status": "ok"})
	})
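	// e.g. curl http://localhost:8080/health  ->  {"status":"ok"}
	// (the listen port comes from cfg.ServerPort; 8080 is only an illustration)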
	api := r.Group("/api")
	{
		// Serve static images downloaded by wiki_worker
		api.StaticFS("/wiki-images", gin.Dir("/app/data/wiki_images", false))

		api.POST("/auth/login", handlers.Login)
		api.POST("/auth/register", handlers.Register)
		api.GET("/auth/check-first-user", handlers.CheckFirstUser)

		news := api.Group("/news")
		{
			news.GET("", handlers.GetNews)
			news.GET("/:id", handlers.GetNewsByID)
			news.DELETE("/:id", middleware.AuthRequired(), handlers.DeleteNews)
		}

		feeds := api.Group("/feeds")
		{
			feeds.GET("", handlers.GetFeeds)
			feeds.GET("/export", handlers.ExportFeeds)
			feeds.GET("/:id", handlers.GetFeedByID)
			feeds.POST("", middleware.AuthRequired(), handlers.CreateFeed)
			feeds.POST("/import", middleware.AuthRequired(), handlers.ImportFeeds)
			feeds.PUT("/:id", middleware.AuthRequired(), handlers.UpdateFeed)
			feeds.DELETE("/:id", middleware.AuthRequired(), handlers.DeleteFeed)
			feeds.POST("/:id/toggle", middleware.AuthRequired(), handlers.ToggleFeedActive)
			feeds.POST("/:id/reactivate", middleware.AuthRequired(), handlers.ReactivateFeed)
		}

		api.GET("/search", handlers.SearchNews)

		api.GET("/entities", handlers.GetEntities)

		api.GET("/stats", handlers.GetStats)

		api.GET("/categories", handlers.GetCategories)
		api.GET("/countries", handlers.GetCountries)

		admin := api.Group("/admin")
		admin.Use(middleware.AuthRequired(), middleware.AdminRequired())
		{
			admin.POST("/aliases", handlers.CreateAlias)
			admin.GET("/aliases/export", handlers.ExportAliases)
			admin.POST("/aliases/import", handlers.ImportAliases)
			admin.POST("/entities/retype", handlers.PatchEntityTipo)
			admin.GET("/backup", handlers.BackupDatabase)
			admin.GET("/backup/news", handlers.BackupNewsZipped)
			admin.GET("/users", handlers.GetUsers)
			admin.POST("/users/:id/promote", handlers.PromoteUser)
			admin.POST("/users/:id/demote", handlers.DemoteUser)
			admin.POST("/reset-db", handlers.ResetDatabase)
			admin.GET("/workers/status", handlers.GetWorkerStatus)
			admin.POST("/workers/config", handlers.SetWorkerConfig)
			admin.POST("/workers/start", handlers.StartWorkers)
			admin.POST("/workers/stop", handlers.StopWorkers)
		}

		auth := api.Group("/auth")
		auth.Use(middleware.AuthRequired())
		{
			auth.GET("/me", handlers.GetCurrentUser)
		}
	}

	middleware.SetJWTSecret(cfg.SecretKey)

	port := cfg.ServerPort
	addr := fmt.Sprintf(":%s", port)

	go func() {
		log.Printf("Server starting on %s", addr)
		if err := r.Run(addr); err != nil {
			log.Fatalf("Failed to start server: %v", err)
		}
	}()

	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
	<-quit

	// Note: r.Run keeps serving in the goroutine above; on SIGINT/SIGTERM
	// the process logs and exits without draining in-flight requests
	log.Println("Shutting down server...")
}
383
backend/cmd/topics/main.go
Normal file
@@ -0,0 +1,383 @@
package main

import (
	"context"
	"log"
	"os"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/rss2/backend/internal/workers"
)

var (
	logger   *log.Logger
	dbPool   *pgxpool.Pool
	sleepSec = 10
	batchSz  = 500
)

type Topic struct {
	ID       int64
	Weight   int
	Keywords []string
}

type Country struct {
	ID       int64
	Name     string
	Keywords []string
}

func init() {
	logger = log.New(os.Stdout, "[TOPICS] ", log.LstdFlags)
}

func loadConfig() {
	sleepSec = getEnvInt("TOPICS_SLEEP", 10)
	batchSz = getEnvInt("TOPICS_BATCH", 500)
}

func getEnvInt(key string, defaultValue int) int {
	if value := os.Getenv(key); value != "" {
		if intVal, err := strconv.Atoi(value); err == nil {
			return intVal
		}
	}
	return defaultValue
}

func ensureSchema(ctx context.Context) error {
	_, err := dbPool.Exec(ctx, `
		CREATE TABLE IF NOT EXISTS topics (
			id SERIAL PRIMARY KEY,
			slug VARCHAR(50) UNIQUE NOT NULL,
			name VARCHAR(100) NOT NULL,
			weight INTEGER DEFAULT 1,
			keywords TEXT,
			group_name VARCHAR(50)
		);
	`)
	if err != nil {
		return err
	}

	_, err = dbPool.Exec(ctx, `
		CREATE TABLE IF NOT EXISTS news_topics (
			noticia_id VARCHAR(32) REFERENCES noticias(id) ON DELETE CASCADE,
			topic_id INTEGER REFERENCES topics(id) ON DELETE CASCADE,
			score INTEGER DEFAULT 0,
			created_at TIMESTAMP DEFAULT NOW(),
			PRIMARY KEY (noticia_id, topic_id)
		);
	`)
	if err != nil {
		return err
	}

	_, err = dbPool.Exec(ctx, `
		ALTER TABLE noticias ADD COLUMN IF NOT EXISTS topics_processed BOOLEAN DEFAULT FALSE;
	`)
	return err
}

func loadTopics(ctx context.Context) ([]Topic, error) {
	rows, err := dbPool.Query(ctx, "SELECT id, weight, keywords FROM topics")
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var topics []Topic
	for rows.Next() {
		var t Topic
		var kwStr *string
		if err := rows.Scan(&t.ID, &t.Weight, &kwStr); err != nil {
			continue
		}
		if kwStr != nil {
			// keywords is a comma-separated list; normalize each entry
			keywords := strings.Split(*kwStr, ",")
			for i := range keywords {
				keywords[i] = strings.ToLower(strings.TrimSpace(keywords[i]))
			}
			t.Keywords = keywords
		}
		topics = append(topics, t)
	}
	return topics, nil
}

func loadCountries(ctx context.Context) ([]Country, error) {
	rows, err := dbPool.Query(ctx, "SELECT id, nombre FROM paises")
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	// Hand-maintained aliases for countries that usually appear under a
	// demonym or capital rather than their proper name
	aliases := map[string][]string{
		"Estados Unidos": {"eeuu", "ee.uu.", "usa", "estadounidense", "washington"},
		"Rusia":          {"ruso", "rusa", "moscú", "kremlin"},
		"China":          {"chino", "china", "pekin", "beijing"},
		"Ucrania":        {"ucraniano", "kiev", "kyiv"},
		"Israel":         {"israelí", "tel aviv", "jerusalén"},
		"España":         {"español", "madrid"},
		"Reino Unido":    {"uk", "londres", "británico"},
		"Francia":        {"francés", "parís"},
		"Alemania":       {"alemán", "berlín"},
		"Palestina":      {"palestino", "gaza", "cisjordania"},
		"Irán":           {"iraní", "teherán"},
	}

	var countries []Country
	for rows.Next() {
		var c Country
		if err := rows.Scan(&c.ID, &c.Name); err != nil {
			continue
		}
		c.Keywords = []string{strings.ToLower(c.Name)}
		if kw, ok := aliases[c.Name]; ok {
			c.Keywords = append(c.Keywords, kw...)
		}
		countries = append(countries, c)
	}
	return countries, nil
}

type NewsItem struct {
	ID      string
	Titulo  *string
	Resumen *string
}

func fetchPendingNews(ctx context.Context, limit int) ([]NewsItem, error) {
	rows, err := dbPool.Query(ctx, `
		SELECT id, titulo, resumen
		FROM noticias
		WHERE topics_processed = FALSE
		ORDER BY fecha DESC
		LIMIT $1
	`, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var items []NewsItem
	for rows.Next() {
		var n NewsItem
		if err := rows.Scan(&n.ID, &n.Titulo, &n.Resumen); err != nil {
			continue
		}
		items = append(items, n)
	}
	return items, nil
}

func findTopics(text string, topics []Topic) []struct {
	TopicID int64
	Score   int
} {
	text = strings.ToLower(text)
	var matches []struct {
		TopicID int64
		Score   int
	}

	for _, topic := range topics {
		count := 0
		for _, kw := range topic.Keywords {
			if strings.Contains(text, kw) {
				count++
			}
		}
		if count > 0 {
			matches = append(matches, struct {
				TopicID int64
				Score   int
			}{topic.ID, topic.Weight * count})
		}
	}
	return matches
}
|
||||
|
||||
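// findBestCountry returns the ID of the country whose keywords appear most
// often in the text, or nil when no country keyword matches at all.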
func findBestCountry(text string, countries []Country) *int64 {
    text = strings.ToLower(text)
    var bestID int64
    bestCount := 0

    for _, c := range countries {
        count := 0
        for _, kw := range c.Keywords {
            if strings.Contains(text, kw) {
                count++
            }
        }
        if count > bestCount {
            bestCount = count
            bestID = c.ID
        }
    }

    if bestCount > 0 {
        return &bestID
    }
    return nil
}

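// processBatch tags one batch of pending articles: it matches topics and the
// best-fitting country against title+summary, writes the relations, and marks
// every fetched article as processed so it is not scanned again.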
func processBatch(ctx context.Context, topics []Topic, countries []Country) (int, error) {
    items, err := fetchPendingNews(ctx, batchSz)
    if err != nil {
        return 0, err
    }

    if len(items) == 0 {
        return 0, nil
    }

    type topicMatch struct {
        NoticiaID string
        TopicID   int64
        Score     int
    }

    type countryUpdate struct {
        PaisID    int64
        NoticiaID string
    }

    var topicMatches []topicMatch
    var countryUpdates []countryUpdate
    var processedIDs []string

    for _, item := range items {
        var text string
        if item.Titulo != nil {
            text += *item.Titulo
        }
        if item.Resumen != nil {
            text += " " + *item.Resumen
        }

        // Find topics
        matches := findTopics(text, topics)
        for _, m := range matches {
            topicMatches = append(topicMatches, topicMatch{item.ID, m.TopicID, m.Score})
        }

        // Find best country
        if countryID := findBestCountry(text, countries); countryID != nil {
            countryUpdates = append(countryUpdates, countryUpdate{*countryID, item.ID})
        }

        processedIDs = append(processedIDs, item.ID)
    }

    // Insert topic relations
    for _, tm := range topicMatches {
        _, err := dbPool.Exec(ctx, `
            INSERT INTO news_topics (noticia_id, topic_id, score)
            VALUES ($1, $2, $3)
            ON CONFLICT (noticia_id, topic_id) DO UPDATE SET score = EXCLUDED.score
        `, tm.NoticiaID, tm.TopicID, tm.Score)
        if err != nil {
            logger.Printf("Error inserting topic: %v", err)
        }
    }

    // Update country
    for _, cu := range countryUpdates {
        _, err := dbPool.Exec(ctx, `
            UPDATE noticias SET pais_id = $1 WHERE id = $2
        `, cu.PaisID, cu.NoticiaID)
        if err != nil {
            logger.Printf("Error updating country: %v", err)
        }
    }

    // Mark as processed
    if len(processedIDs) > 0 {
        _, err := dbPool.Exec(ctx, `
            UPDATE noticias SET topics_processed = TRUE WHERE id = ANY($1)
        `, processedIDs)
        if err != nil {
            return 0, err
        }
    }

    return len(items), nil
}

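// main runs the topics worker loop: wait one interval, reload topics and
// countries from the database, then process one batch of pending articles.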
func main() {
    loadConfig()
    logger.Println("Starting Topics Worker")

    cfg := workers.LoadDBConfig()
    if err := workers.Connect(cfg); err != nil {
        logger.Fatalf("Failed to connect to database: %v", err)
    }
    dbPool = workers.GetPool()
    defer workers.Close()

    ctx := context.Background()

    // Ensure schema
    if err := ensureSchema(ctx); err != nil {
        logger.Printf("Error ensuring schema: %v", err)
    }

    sigChan := make(chan os.Signal, 1)
    signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

    go func() {
        <-sigChan
        logger.Println("Shutting down...")
        os.Exit(0)
    }()

    logger.Printf("Config: sleep=%ds, batch=%d", sleepSec, batchSz)

    for {
        // Wait one interval before each pass
        time.Sleep(time.Duration(sleepSec) * time.Second)

        topics, err := loadTopics(ctx)
        if err != nil {
            logger.Printf("Error loading topics: %v", err)
            continue
        }

        if len(topics) == 0 {
            logger.Println("No topics found in DB")
            time.Sleep(time.Duration(sleepSec) * time.Second)
            continue
        }

        countries, err := loadCountries(ctx)
        if err != nil {
            logger.Printf("Error loading countries: %v", err)
            continue
        }

        count, err := processBatch(ctx, topics, countries)
        if err != nil {
            logger.Printf("Error processing batch: %v", err)
            continue
        }

        if count > 0 {
            logger.Printf("Processed %d news items", count)
        }

        if count < batchSz {
            time.Sleep(time.Duration(sleepSec) * time.Second)
        }
    }
}
267
backend/cmd/wiki_worker/main.go
Normal file
@@ -0,0 +1,267 @@
package main

import (
    "context"
    "encoding/json"
    "fmt"
    "io"
    "log"
    "net/http"
    "net/url"
    "os"
    "os/signal"
    "path/filepath"
    "strconv"
    "strings"
    "syscall"
    "time"

    "github.com/jackc/pgx/v5/pgxpool"
    "github.com/rss2/backend/internal/workers"
)

var (
    logger        *log.Logger
    pool          *pgxpool.Pool
    sleepInterval = 30
    batchSize     = 50
    imagesDir     = "/app/data/wiki_images"
)

type WikiSummary struct {
    Type         string `json:"type"`
    Title        string `json:"title"`
    DisplayTitle string `json:"displaytitle"`
    Extract      string `json:"extract"`
    ContentUrls  struct {
        Desktop struct {
            Page string `json:"page"`
        } `json:"desktop"`
    } `json:"content_urls"`
    Thumbnail *struct {
        Source string `json:"source"`
        Width  int    `json:"width"`
        Height int    `json:"height"`
    } `json:"thumbnail"`
}

type Tag struct {
    ID    int64
    Valor string
    Tipo  string
}

func init() {
    logger = log.New(os.Stdout, "[WIKI_WORKER] ", log.LstdFlags)
}

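// getPendingTags returns up to batchSize unchecked person/organization tags,
// most-referenced first, so popular tags get Wikipedia data soonest.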
func getPendingTags(ctx context.Context) ([]Tag, error) {
    rows, err := pool.Query(ctx, `
        SELECT t.id, t.valor, t.tipo
        FROM tags t
        LEFT JOIN (
            SELECT tag_id, COUNT(*) as cnt
            FROM tags_noticia
            GROUP BY tag_id
        ) c ON c.tag_id = t.id
        WHERE t.tipo IN ('persona', 'organizacion')
        AND t.wiki_checked = FALSE
        ORDER BY COALESCE(c.cnt, 0) DESC, t.id DESC
        LIMIT $1
    `, batchSize)
    if err != nil {
        return nil, err
    }
    defer rows.Close()

    var tags []Tag
    for rows.Next() {
        var t Tag
        if err := rows.Scan(&t.ID, &t.Valor, &t.Tipo); err == nil {
            tags = append(tags, t)
        }
    }
    return tags, nil
}

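// downloadImage fetches imgURL and writes the response body to destPath.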
func downloadImage(imgURL, destPath string) error {
    client := &http.Client{Timeout: 15 * time.Second}
    req, err := http.NewRequest("GET", imgURL, nil)
    if err != nil {
        return err
    }
    req.Header.Set("User-Agent", "RSS2-WikiWorker/1.0 (https://github.com/proyecto/rss2)")

    resp, err := client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != 200 {
        return fmt.Errorf("HTTP %d", resp.StatusCode)
    }

    out, err := os.Create(destPath)
    if err != nil {
        return err
    }
    defer out.Close()

    _, err = io.Copy(out, resp.Body)
    return err
}

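// fetchWikipediaInfo queries the Spanish Wikipedia REST summary endpoint for
// the given tag value. It returns (nil, nil) when the page does not exist or
// is a disambiguation page, and an error for network or rate-limit failures.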
func fetchWikipediaInfo(valor string) (*WikiSummary, error) {
    // Normalize the value to be wiki-compatible
    title := strings.ReplaceAll(strings.TrimSpace(valor), " ", "_")
    encodedTitle := url.PathEscape(title)

    apiURL := fmt.Sprintf("https://es.wikipedia.org/api/rest_v1/page/summary/%s", encodedTitle)

    client := &http.Client{Timeout: 10 * time.Second}
    req, err := http.NewRequest("GET", apiURL, nil)
    if err != nil {
        return nil, err
    }
    // Per MediaWiki API policy: https://meta.wikimedia.org/wiki/User-Agent_policy
    req.Header.Set("User-Agent", "RSS2-WikiWorker/1.0 (pietrelinux@gmail.com)")

    resp, err := client.Do(req)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode == 429 {
        return nil, fmt.Errorf("HTTP 429: Too Many Requests (Rate Limited)")
    }
    if resp.StatusCode == 404 {
        return nil, nil // Not found, but handled successfully without error
    }
    if resp.StatusCode != 200 {
        return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
    }

    var summary WikiSummary
    if err := json.NewDecoder(resp.Body).Decode(&summary); err != nil {
        return nil, err
    }

    // Filter out disambiguation pages
    if summary.Type == "disambiguation" {
        return nil, nil // Treat as not found to strictly avoid incorrect tooltips
    }

    return &summary, nil
}

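// processTag looks a tag up on Wikipedia and stores summary, URL, and image
// path on the tag row. Not-found tags are marked checked so they are skipped
// in later passes; transient fetch errors leave the tag unchecked for retry.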
func processTag(ctx context.Context, tag Tag) {
    logger.Printf("Processing tag %d: %s", tag.ID, tag.Valor)

    summary, err := fetchWikipediaInfo(tag.Valor)
    if err != nil {
        logger.Printf("Error querying Wikipedia for %s: %v", tag.Valor, err)
        return
    }

    if summary == nil || summary.Extract == "" {
        // Not found or disambiguation
        _, _ = pool.Exec(ctx, "UPDATE tags SET wiki_checked = TRUE WHERE id = $1", tag.ID)
        logger.Printf("No valid Wikipedia results for: %s", tag.Valor)
        return
    }

    var localImagePath *string
    if summary.Thumbnail != nil && summary.Thumbnail.Source != "" {
        ext := ".jpg"
        if strings.HasSuffix(strings.ToLower(summary.Thumbnail.Source), ".png") {
            ext = ".png"
        }
        fileName := fmt.Sprintf("wiki_%d%s", tag.ID, ext)
        destPath := filepath.Join(imagesDir, fileName)

        if err := downloadImage(summary.Thumbnail.Source, destPath); err != nil {
            logger.Printf("Error downloading image for %s: %v", tag.Valor, err)
            // Keep the external URL as a fallback when the download fails
            src := summary.Thumbnail.Source
            localImagePath = &src
        } else {
            relativePath := "/api/wiki-images/" + fileName
            localImagePath = &relativePath
        }
    }

    wikiURL := summary.ContentUrls.Desktop.Page

    _, err = pool.Exec(ctx, `
        UPDATE tags
        SET wiki_summary = $1,
            wiki_url = $2,
            image_path = $3,
            wiki_checked = TRUE
        WHERE id = $4
    `, summary.Extract, wikiURL, localImagePath, tag.ID)

    if err != nil {
        logger.Printf("Error updating database for tag %d: %v", tag.ID, err)
    } else {
        logger.Printf("Successfully updated: %s (image: %v)", tag.Valor, localImagePath != nil)
    }
}

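// main runs the wiki worker loop: fetch a batch of pending tags, enrich each
// one from Wikipedia, and pause between requests to respect rate limits.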
func main() {
    // WIKI_SLEEP overrides the polling interval in seconds
    if val := os.Getenv("WIKI_SLEEP"); val != "" {
        if v, err := strconv.Atoi(val); err == nil && v > 0 {
            sleepInterval = v
        }
    }

    logger.Println("Starting Wiki Worker...")

    if err := os.MkdirAll(imagesDir, 0755); err != nil {
        logger.Fatalf("Error creating images directory: %v", err)
    }

    cfg := workers.LoadDBConfig()
    if err := workers.Connect(cfg); err != nil {
        logger.Fatalf("Failed to connect to database: %v", err)
    }
    pool = workers.GetPool()
    defer workers.Close()

    ctx := context.Background()

    sigChan := make(chan os.Signal, 1)
    signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

    go func() {
        <-sigChan
        logger.Println("Shutting down gracefully...")
        workers.Close()
        os.Exit(0)
    }()

    logger.Printf("Config: sleep=%ds, batch=%d", sleepInterval, batchSize)

    for {
        tags, err := getPendingTags(ctx)
        if err != nil {
            logger.Printf("Error fetching pending tags: %v", err)
            time.Sleep(10 * time.Second)
            continue
        }

        if len(tags) == 0 {
            logger.Printf("No pending tags. Sleeping %d seconds...", sleepInterval)
            time.Sleep(time.Duration(sleepInterval) * time.Second)
            continue
        }

        logger.Printf("Fetched %d tags to process...", len(tags))

        for _, tag := range tags {
            processTag(ctx, tag)
            time.Sleep(3 * time.Second) // Increased delay to avoid Wikipedia rate limits (429)
        }
    }
}