// coconews/backend/cmd/scraper/main.go
package main

import (
	"context"
	"crypto/md5"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/rss2/backend/internal/workers"
)

var (
	logger        *log.Logger
	pool          *pgxpool.Pool
	sleepInterval = 60
	batchSize     = 10 // read from SCRAPER_BATCH; not yet applied in the processing loop
)
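
// URLSource is one row of the fuentes_url table: a site to scrape, plus the
// category, country, and language metadata attached to whatever is extracted
// from it.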
type URLSource struct {
	ID          int64
	Nombre      string
	URL         string
	CategoriaID *int64
	PaisID      *int64
	Idioma      *string
	Active      bool
}
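
// Article holds the fields extracted from a single scraped page.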
type Article struct {
	Title    string
	Summary  string
	Content  string
	URL      string
	ImageURL string
	PubDate  *time.Time
}
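
// init configures the shared logger before main runs.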
func init() {
	logger = log.New(os.Stdout, "[SCRAPER] ", log.LstdFlags)
}
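
// loadConfig reads tunables from the environment, keeping the defaults
// declared above when a variable is unset.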
func loadConfig() {
	sleepInterval = getEnvInt("SCRAPER_SLEEP", 60)
	batchSize = getEnvInt("SCRAPER_BATCH", 10)
}
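
// getEnvInt parses an integer environment variable, returning defaultValue
// when the variable is unset or not a valid integer.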
func getEnvInt(key string, defaultValue int) int {
	if value := os.Getenv(key); value != "" {
		if intVal, err := strconv.Atoi(value); err == nil {
			return intVal
		}
	}
	return defaultValue
}
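
// truncate returns s shortened to at most n runes. Slicing runes rather than
// bytes avoids splitting a multi-byte UTF-8 sequence (accented characters are
// common in these Spanish-language fields), and it guards against the
// out-of-range panic that a bare s[:n] causes when s is shorter than n bytes.
func truncate(s string, n int) string {
	r := []rune(s)
	if len(r) <= n {
		return s
	}
	return string(r[:n])
}

// getActiveURLs loads every source marked activo in fuentes_url.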
func getActiveURLs(ctx context.Context) ([]URLSource, error) {
	rows, err := pool.Query(ctx, `
		SELECT id, nombre, url, categoria_id, pais_id, idioma, activo
		FROM fuentes_url
		WHERE activo = true
	`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var sources []URLSource
	for rows.Next() {
		var s URLSource
		if err := rows.Scan(&s.ID, &s.Nombre, &s.URL, &s.CategoriaID, &s.PaisID, &s.Idioma, &s.Active); err != nil {
			logger.Printf("Error scanning fuentes_url row: %v", err)
			continue
		}
		sources = append(sources, s)
	}
	return sources, rows.Err()
}
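
// updateSourceStatus records the outcome of the latest scrape attempt on the
// source's fuentes_url row.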
func updateSourceStatus(ctx context.Context, sourceID int64, status, message string, httpCode int) error {
	_, err := pool.Exec(ctx, `
		UPDATE fuentes_url
		SET last_check = NOW(),
			last_status = $1,
			status_message = $2,
			last_http_code = $3
		WHERE id = $4
	`, status, message, httpCode, sourceID)
	return err
}
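
// extractArticle fetches source.URL and pulls the title, summary, image, and
// body text out of the HTML, preferring Open Graph metadata and falling back
// to common tags and class selectors.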
func extractArticle(ctx context.Context, source URLSource) (*Article, error) {
	client := &http.Client{
		Timeout: 30 * time.Second,
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, source.URL, nil)
	if err != nil {
		return nil, err
	}
	// Some sites refuse requests from obvious bots, so send browser-like headers.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Set("Accept-Language", "en-US,en;q=0.5")
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
	}
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}
	article := &Article{
		URL: source.URL,
	}
	// Extract title: og:title, then meta title, then the first h1, then <title>
	article.Title = doc.Find("meta[property='og:title']").First().AttrOr("content", "")
	if article.Title == "" {
		article.Title = doc.Find("meta[name='title']").First().AttrOr("content", "")
	}
	if article.Title == "" {
		article.Title = doc.Find("h1").First().Text()
	}
	if article.Title == "" {
		article.Title = doc.Find("title").First().Text()
	}
	// Extract description/summary
	article.Summary = doc.Find("meta[property='og:description']").First().AttrOr("content", "")
	if article.Summary == "" {
		article.Summary = doc.Find("meta[name='description']").First().AttrOr("content", "")
	}
	// Extract image
	article.ImageURL = doc.Find("meta[property='og:image']").First().AttrOr("content", "")
	// Extract main content - try common selectors
	contentSelectors := []string{
		"article",
		"[role='main']",
		"main",
		".article-content",
		".post-content",
		".entry-content",
		".content",
		"#content",
	}
	for _, sel := range contentSelectors {
		if content := doc.Find(sel).First(); content.Length() > 0 {
			article.Content = content.Text()
			break
		}
	}
	// Clean up
	article.Title = strings.TrimSpace(article.Title)
	article.Summary = strings.TrimSpace(article.Summary)
	article.Content = strings.TrimSpace(article.Content)
	// Truncate the summary rune-safely rather than by byte offset
	article.Summary = truncate(article.Summary, 500)
	return article, nil
}
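
// saveArticle inserts the article into noticias, keyed by an MD5 hash of its
// URL (used only as a stable identifier, not for security). It reports
// whether a new row was actually created.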
func saveArticle(ctx context.Context, source URLSource, article *Article) (bool, error) {
	finalURL := article.URL
	if finalURL == "" {
		finalURL = source.URL
	}
	// Generate a stable ID from the URL so re-scrapes dedupe naturally
	articleID := fmt.Sprintf("%x", md5.Sum([]byte(finalURL)))
	title := article.Title
	if title == "" {
		title = "Sin título"
	}
	summary := article.Summary
	if summary == "" && article.Content != "" {
		summary = truncate(article.Content, 500)
	}
	pubDate := time.Now()
	if article.PubDate != nil {
		pubDate = *article.PubDate
	}
	// ON CONFLICT DO NOTHING makes the insert idempotent; RowsAffected tells
	// us whether this run actually created the row, without a separate
	// existence query that would race with concurrent writers.
	tag, err := pool.Exec(ctx, `
		INSERT INTO noticias (
			id, titulo, resumen, url, fecha, imagen_url,
			fuente_nombre, categoria_id, pais_id
		) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
		ON CONFLICT (id) DO NOTHING
	`, articleID, title, summary, finalURL, pubDate, article.ImageURL,
		source.Nombre, source.CategoriaID, source.PaisID)
	if err != nil {
		return false, err
	}
	return tag.RowsAffected() > 0, nil
}
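
// processSource scrapes one source and records the result: a noticias row on
// success, and a status update on the source either way.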
func processSource(ctx context.Context, source URLSource) {
	logger.Printf("Processing: %s (%s)", source.Nombre, source.URL)
	article, err := extractArticle(ctx, source)
	if err != nil {
		logger.Printf("Error extracting article from %s: %v", source.URL, err)
		status := "ERROR"
		if strings.Contains(err.Error(), "HTTP") {
			status = "ERROR_HTTP"
		}
		// truncate keeps the message within the status_message column and
		// avoids the panic a bare [:200] slice causes on short error strings
		updateSourceStatus(ctx, source.ID, status, truncate(err.Error(), 200), 0)
		return
	}
	if article.Title == "" {
		logger.Printf("No title found for %s", source.URL)
		updateSourceStatus(ctx, source.ID, "ERROR_PARSE", "No title extracted", 200)
		return
	}
	saved, err := saveArticle(ctx, source, article)
	if err != nil {
		logger.Printf("Error saving article: %v", err)
		updateSourceStatus(ctx, source.ID, "ERROR_DB", truncate(err.Error(), 200), 0)
		return
	}
	if saved {
		logger.Printf("Saved: %s", article.Title)
		updateSourceStatus(ctx, source.ID, "OK", "News created successfully", 200)
	} else {
		logger.Printf("Already exists: %s", article.Title)
		updateSourceStatus(ctx, source.ID, "OK", "News already exists", 200)
	}
}
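
// main connects to Postgres, then polls every active source on a fixed
// ticker until it receives SIGINT or SIGTERM.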
func main() {
	loadConfig()
	logger.Println("Starting Scraper Worker")
	cfg := workers.LoadDBConfig()
	if err := workers.Connect(cfg); err != nil {
		logger.Fatalf("Failed to connect to database: %v", err)
	}
	pool = workers.GetPool()
	defer workers.Close()
	logger.Println("Connected to PostgreSQL")
	// Cancel the context on SIGINT/SIGTERM so the loop exits cleanly; calling
	// os.Exit from a signal goroutine would skip the deferred workers.Close.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()
	logger.Printf("Config: sleep=%ds, batch=%d", sleepInterval, batchSize)
	ticker := time.NewTicker(time.Duration(sleepInterval) * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			logger.Println("Shutting down...")
			return
		case <-ticker.C:
			sources, err := getActiveURLs(ctx)
			if err != nil {
				logger.Printf("Error fetching URLs: %v", err)
				continue
			}
			if len(sources) == 0 {
				logger.Println("No active URLs to process")
				continue
			}
			logger.Printf("Processing %d sources", len(sources))
			for _, source := range sources {
				processSource(ctx, source)
				time.Sleep(2 * time.Second) // Rate limiting between sources
			}
		}
	}
}