// Command scraper polls active URL sources from PostgreSQL, scrapes one
// article from each page and stores previously-unseen articles in the
// noticias table.
package main

import (
	"context"
	"crypto/md5"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"
	"unicode/utf8"

	"github.com/PuerkitoBio/goquery"
	"github.com/jackc/pgx/v5/pgxpool"

	"github.com/rss2/backend/internal/workers"
)

var (
	logger *log.Logger
	pool   *pgxpool.Pool

	// httpClient is shared by all requests so TCP/TLS connections are
	// reused instead of being re-dialed for every source.
	httpClient = &http.Client{Timeout: 30 * time.Second}

	sleepInterval = 60 // seconds between polling cycles (SCRAPER_SLEEP)
	batchSize     = 10 // loaded from SCRAPER_BATCH; currently only logged, not enforced
)

// URLSource mirrors one row of the fuentes_url table.
type URLSource struct {
	ID          int64
	Nombre      string
	URL         string
	CategoriaID *int64
	PaisID      *int64
	Idioma      *string
	Active      bool
}

// Article holds the fields scraped from a single page.
type Article struct {
	Title    string
	Summary  string
	Content  string
	URL      string
	ImageURL string
	PubDate  *time.Time
}

func init() {
	logger = log.New(os.Stdout, "[SCRAPER] ", log.LstdFlags)
}

// loadConfig reads tunables from the environment, keeping defaults when a
// variable is unset or malformed.
func loadConfig() {
	sleepInterval = getEnvInt("SCRAPER_SLEEP", 60)
	batchSize = getEnvInt("SCRAPER_BATCH", 10)
}

// getEnvInt returns the integer value of the environment variable key, or
// defaultValue when the variable is unset or not a valid integer.
func getEnvInt(key string, defaultValue int) int {
	if value := os.Getenv(key); value != "" {
		if n, err := strconv.Atoi(value); err == nil {
			return n
		}
	}
	return defaultValue
}

// truncate returns s cut to at most limit bytes, backing off so the result
// is still valid UTF-8. The original code sliced err.Error()[:200]
// unconditionally, which panics whenever the string is shorter than the
// limit, and summary[:500] could split a multi-byte rune.
func truncate(s string, limit int) string {
	if len(s) <= limit {
		return s
	}
	s = s[:limit]
	for len(s) > 0 && !utf8.ValidString(s) {
		s = s[:len(s)-1]
	}
	return s
}

// getActiveURLs loads every fuentes_url row with activo = true.
func getActiveURLs(ctx context.Context) ([]URLSource, error) {
	rows, err := pool.Query(ctx, `
		SELECT id, nombre, url, categoria_id, pais_id, idioma, activo
		FROM fuentes_url
		WHERE activo = true
	`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var sources []URLSource
	for rows.Next() {
		var s URLSource
		if err := rows.Scan(&s.ID, &s.Nombre, &s.URL, &s.CategoriaID, &s.PaisID, &s.Idioma, &s.Active); err != nil {
			// Skip malformed rows, but make the failure visible instead of
			// silently dropping it.
			logger.Printf("Error scanning fuentes_url row: %v", err)
			continue
		}
		sources = append(sources, s)
	}
	// Surface iteration errors; the original never checked rows.Err().
	return sources, rows.Err()
}

// updateSourceStatus records the outcome of the last check for a source.
// The message is truncated to 200 bytes here so no caller can panic by
// slicing a short string.
func updateSourceStatus(ctx context.Context, sourceID int64, status, message string, httpCode int) error {
	_, err := pool.Exec(ctx, `
		UPDATE fuentes_url
		SET last_check = NOW(), last_status = $1, status_message = $2, last_http_code = $3
		WHERE id = $4
	`, status, truncate(message, 200), httpCode, sourceID)
	return err
}

// reportStatus wraps updateSourceStatus and logs (rather than drops) any
// database error, since callers have no further use for it.
func reportStatus(ctx context.Context, sourceID int64, status, message string, httpCode int) {
	if err := updateSourceStatus(ctx, sourceID, status, message, httpCode); err != nil {
		logger.Printf("Error updating status for source %d: %v", sourceID, err)
	}
}

// extractArticle fetches source.URL and scrapes title, summary, image and
// main content from common meta tags and content selectors.
func extractArticle(ctx context.Context, source URLSource) (*Article, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, source.URL, nil)
	if err != nil {
		return nil, err
	}
	// Present a browser-like identity: many sites block default Go clients.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept-Language", "en-US,en;q=0.5")

	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}

	article := &Article{URL: source.URL}

	// Title: prefer Open Graph, then meta title, then first h1, then <title>.
	article.Title = doc.Find("meta[property='og:title']").First().AttrOr("content", "")
	if article.Title == "" {
		article.Title = doc.Find("meta[name='title']").First().AttrOr("content", "")
	}
	if article.Title == "" {
		article.Title = doc.Find("h1").First().Text()
	}
	if article.Title == "" {
		article.Title = doc.Find("title").First().Text()
	}

	// Description/summary: Open Graph first, then standard meta description.
	article.Summary = doc.Find("meta[property='og:description']").First().AttrOr("content", "")
	if article.Summary == "" {
		article.Summary = doc.Find("meta[name='description']").First().AttrOr("content", "")
	}

	article.ImageURL = doc.Find("meta[property='og:image']").First().AttrOr("content", "")

	// Main content: first match among common article containers wins.
	contentSelectors := []string{
		"article",
		"[role='main']",
		"main",
		".article-content",
		".post-content",
		".entry-content",
		".content",
		"#content",
	}
	for _, sel := range contentSelectors {
		if node := doc.Find(sel).First(); node.Length() > 0 {
			article.Content = node.Text()
			break
		}
	}

	article.Title = strings.TrimSpace(article.Title)
	article.Summary = strings.TrimSpace(article.Summary)
	article.Content = strings.TrimSpace(article.Content)
	article.Summary = truncate(article.Summary, 500)

	return article, nil
}

// saveArticle inserts the article unless a row with the same URL-derived ID
// already exists. It returns true when a new row was created.
func saveArticle(ctx context.Context, source URLSource, article *Article) (bool, error) {
	finalURL := article.URL
	if finalURL == "" {
		finalURL = source.URL
	}

	// The row ID is the MD5 of the URL. MD5 is weak cryptographically, but
	// here it is only a dedup key, and changing it would orphan every
	// existing row — keep it.
	articleID := fmt.Sprintf("%x", md5.Sum([]byte(finalURL)))

	title := article.Title
	if title == "" {
		title = "Sin título"
	}

	summary := article.Summary
	if summary == "" && article.Content != "" {
		summary = truncate(article.Content, 500)
	}

	pubDate := time.Now()
	if article.PubDate != nil {
		pubDate = *article.PubDate
	}

	// ON CONFLICT DO NOTHING already makes this insert idempotent; the
	// original's separate EXISTS pre-check was a race (TOCTOU) and an extra
	// round trip. The command tag tells us whether a row was created.
	tag, err := pool.Exec(ctx, `
		INSERT INTO noticias (
			id, titulo, resumen, url, fecha, imagen_url, fuente_nombre, categoria_id, pais_id
		) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
		ON CONFLICT (id) DO NOTHING
	`, articleID, title, summary, finalURL, pubDate, article.ImageURL,
		source.Nombre, source.CategoriaID, source.PaisID)
	if err != nil {
		return false, err
	}
	return tag.RowsAffected() > 0, nil
}

// processSource scrapes one source and records the outcome in fuentes_url.
func processSource(ctx context.Context, source URLSource) {
	logger.Printf("Processing: %s (%s)", source.Nombre, source.URL)

	article, err := extractArticle(ctx, source)
	if err != nil {
		logger.Printf("Error extracting article from %s: %v", source.URL, err)
		status := "ERROR"
		if strings.Contains(err.Error(), "HTTP") {
			status = "ERROR_HTTP"
		}
		reportStatus(ctx, source.ID, status, err.Error(), 0)
		return
	}

	if article.Title == "" {
		logger.Printf("No title found for %s", source.URL)
		reportStatus(ctx, source.ID, "ERROR_PARSE", "No title extracted", 200)
		return
	}

	saved, err := saveArticle(ctx, source, article)
	if err != nil {
		logger.Printf("Error saving article: %v", err)
		reportStatus(ctx, source.ID, "ERROR_DB", err.Error(), 0)
		return
	}

	if saved {
		logger.Printf("Saved: %s", article.Title)
		reportStatus(ctx, source.ID, "OK", "News created successfully", 200)
	} else {
		logger.Printf("Already exists: %s", article.Title)
		reportStatus(ctx, source.ID, "OK", "News already exists", 200)
	}
}

// runCycle fetches all active sources and processes them sequentially, with
// a small fixed delay between requests as crude rate limiting.
func runCycle(ctx context.Context) {
	sources, err := getActiveURLs(ctx)
	if err != nil {
		logger.Printf("Error fetching URLs: %v", err)
		return
	}
	if len(sources) == 0 {
		logger.Println("No active URLs to process")
		return
	}
	logger.Printf("Processing %d sources", len(sources))
	for _, source := range sources {
		// Stop mid-batch when shutdown has been requested.
		if ctx.Err() != nil {
			return
		}
		processSource(ctx, source)
		time.Sleep(2 * time.Second) // rate limiting between sources
	}
}

func main() {
	loadConfig()
	logger.Println("Starting Scraper Worker")

	cfg := workers.LoadDBConfig()
	if err := workers.Connect(cfg); err != nil {
		logger.Fatalf("Failed to connect to database: %v", err)
	}
	pool = workers.GetPool()
	defer workers.Close()
	logger.Println("Connected to PostgreSQL")

	// signal.NotifyContext gives a graceful shutdown. The original spawned a
	// goroutine that called os.Exit(0), which skipped the deferred
	// workers.Close() and could interrupt a database write mid-flight.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	logger.Printf("Config: sleep=%ds, batch=%d", sleepInterval, batchSize)

	ticker := time.NewTicker(time.Duration(sleepInterval) * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			logger.Println("Shutting down...")
			return
		case <-ticker.C:
			runCycle(ctx)
		}
	}
}