go integration and wikipedia
This commit is contained in:
parent
47a252e339
commit
ee90335b92
7828 changed files with 1307913 additions and 20807 deletions
330
backend/cmd/scraper/main.go
Normal file
330
backend/cmd/scraper/main.go
Normal file
|
|
@ -0,0 +1,330 @@
|
|||
package main
|
||||
|
||||
import (
	"context"
	"crypto/md5"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"
	"unicode/utf8"

	"github.com/PuerkitoBio/goquery"
	"github.com/jackc/pgx/v5/pgxpool"
	"github.com/rss2/backend/internal/workers"
)
|
||||
|
||||
// Package-level state shared across the scraper's lifetime.
var (
	// logger writes all scraper output to stdout with a "[SCRAPER] " prefix.
	logger *log.Logger
	// dbPool is never referenced in this file — NOTE(review): looks dead; confirm and remove.
	dbPool *workers.Config
	// pool is the PostgreSQL connection pool, assigned in main() after workers.Connect.
	pool *pgxpool.Pool
	// sleepInterval is the pause between scrape passes in seconds (env SCRAPER_SLEEP).
	sleepInterval = 60
	// batchSize is loaded from SCRAPER_BATCH — NOTE(review): logged but never used to limit work.
	batchSize = 10
)
|
||||
|
||||
// URLSource is one row of the fuentes_url table: a URL to scrape plus its
// source metadata. Nullable columns are mapped to pointer fields.
type URLSource struct {
	ID          int64
	Nombre      string  // source display name, copied into noticias.fuente_nombre
	URL         string  // page to fetch
	CategoriaID *int64  // optional category id (nullable column)
	PaisID      *int64  // optional country id (nullable column)
	Idioma      *string // optional language code (nullable column)
	Active      bool    // mirrors the activo column
}
|
||||
|
||||
// Article holds the fields scraped from a single web page before it is
// persisted into the noticias table.
type Article struct {
	Title    string     // from og:title with h1/title fallbacks
	Summary  string     // from og:description / meta description, capped at 500 bytes
	Content  string     // best-effort main body text from common selectors
	URL      string     // article URL (falls back to the source URL when empty)
	ImageURL string     // lead image from og:image; may be empty
	PubDate  *time.Time // publication time; nil means unknown (now() is used on save)
}
|
||||
|
||||
func init() {
|
||||
logger = log.New(os.Stdout, "[SCRAPER] ", log.LstdFlags)
|
||||
logger.SetOutput(os.Stdout)
|
||||
}
|
||||
|
||||
func loadConfig() {
|
||||
sleepInterval = getEnvInt("SCRAPER_SLEEP", 60)
|
||||
batchSize = getEnvInt("SCRAPER_BATCH", 10)
|
||||
}
|
||||
|
||||
// getEnvInt reads the environment variable key and parses it as a base-10
// integer. defaultValue is returned when the variable is unset, empty, or
// not a valid integer.
func getEnvInt(key string, defaultValue int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return defaultValue
	}
	parsed, err := strconv.Atoi(raw)
	if err != nil {
		return defaultValue
	}
	return parsed
}
|
||||
|
||||
func getActiveURLs(ctx context.Context) ([]URLSource, error) {
|
||||
rows, err := pool.Query(ctx, `
|
||||
SELECT id, nombre, url, categoria_id, pais_id, idioma, activo
|
||||
FROM fuentes_url
|
||||
WHERE activo = true
|
||||
`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var sources []URLSource
|
||||
for rows.Next() {
|
||||
var s URLSource
|
||||
err := rows.Scan(&s.ID, &s.Nombre, &s.URL, &s.CategoriaID, &s.PaisID, &s.Idioma, &s.Active)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sources = append(sources, s)
|
||||
}
|
||||
return sources, nil
|
||||
}
|
||||
|
||||
func updateSourceStatus(ctx context.Context, sourceID int64, status, message string, httpCode int) error {
|
||||
_, err := pool.Exec(ctx, `
|
||||
UPDATE fuentes_url
|
||||
SET last_check = NOW(),
|
||||
last_status = $1,
|
||||
status_message = $2,
|
||||
last_http_code = $3
|
||||
WHERE id = $4
|
||||
`, status, message, httpCode, sourceID)
|
||||
return err
|
||||
}
|
||||
|
||||
func extractArticle(source URLSource) (*Article, error) {
|
||||
client := &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("GET", source.URL, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
||||
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
article := &Article{
|
||||
URL: source.URL,
|
||||
}
|
||||
|
||||
// Extract title
|
||||
article.Title = doc.Find("meta[property='og:title']").First().AttrOr("content", "")
|
||||
if article.Title == "" {
|
||||
article.Title = doc.Find("meta[name='title']").First().AttrOr("content", "")
|
||||
}
|
||||
if article.Title == "" {
|
||||
article.Title = doc.Find("h1").First().Text()
|
||||
}
|
||||
if article.Title == "" {
|
||||
article.Title = doc.Find("title").First().Text()
|
||||
}
|
||||
|
||||
// Extract description/summary
|
||||
article.Summary = doc.Find("meta[property='og:description']").First().AttrOr("content", "")
|
||||
if article.Summary == "" {
|
||||
article.Summary = doc.Find("meta[name='description']").First().AttrOr("content", "")
|
||||
}
|
||||
|
||||
// Extract image
|
||||
article.ImageURL = doc.Find("meta[property='og:image']").First().AttrOr("content", "")
|
||||
|
||||
// Extract main content - try common selectors
|
||||
contentSelectors := []string{
|
||||
"article",
|
||||
"[role='main']",
|
||||
"main",
|
||||
".article-content",
|
||||
".post-content",
|
||||
".entry-content",
|
||||
".content",
|
||||
"#content",
|
||||
}
|
||||
|
||||
for _, sel := range contentSelectors {
|
||||
content := doc.Find(sel).First()
|
||||
if content.Length() > 0 {
|
||||
article.Content = content.Text()
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up
|
||||
article.Title = strings.TrimSpace(article.Title)
|
||||
article.Summary = strings.TrimSpace(article.Summary)
|
||||
article.Content = strings.TrimSpace(article.Content)
|
||||
|
||||
// Truncate summary if too long
|
||||
if len(article.Summary) > 500 {
|
||||
article.Summary = article.Summary[:500]
|
||||
}
|
||||
|
||||
return article, nil
|
||||
}
|
||||
|
||||
func saveArticle(ctx context.Context, source URLSource, article *Article) (bool, error) {
|
||||
finalURL := article.URL
|
||||
if finalURL == "" {
|
||||
finalURL = source.URL
|
||||
}
|
||||
|
||||
// Generate ID from URL
|
||||
articleID := fmt.Sprintf("%x", md5.Sum([]byte(finalURL)))
|
||||
|
||||
// Check if exists
|
||||
var exists bool
|
||||
err := pool.QueryRow(ctx, "SELECT EXISTS(SELECT 1 FROM noticias WHERE id = $1)", articleID).Scan(&exists)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if exists {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
title := article.Title
|
||||
if title == "" {
|
||||
title = "Sin título"
|
||||
}
|
||||
|
||||
summary := article.Summary
|
||||
if summary == "" && article.Content != "" {
|
||||
summary = article.Content
|
||||
if len(summary) > 500 {
|
||||
summary = summary[:500]
|
||||
}
|
||||
}
|
||||
|
||||
pubDate := time.Now()
|
||||
if article.PubDate != nil {
|
||||
pubDate = *article.PubDate
|
||||
}
|
||||
|
||||
_, err = pool.Exec(ctx, `
|
||||
INSERT INTO noticias (
|
||||
id, titulo, resumen, url, fecha, imagen_url,
|
||||
fuente_nombre, categoria_id, pais_id
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
|
||||
ON CONFLICT (id) DO NOTHING
|
||||
`, articleID, title, summary, finalURL, pubDate, article.ImageURL,
|
||||
source.Nombre, source.CategoriaID, source.PaisID)
|
||||
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func processSource(ctx context.Context, source URLSource) {
|
||||
logger.Printf("Processing: %s (%s)", source.Nombre, source.URL)
|
||||
|
||||
article, err := extractArticle(source)
|
||||
if err != nil {
|
||||
logger.Printf("Error extracting article from %s: %v", source.URL, err)
|
||||
status := "ERROR"
|
||||
if strings.Contains(err.Error(), "HTTP") {
|
||||
status = "ERROR_HTTP"
|
||||
}
|
||||
updateSourceStatus(ctx, source.ID, status, err.Error()[:200], 0)
|
||||
return
|
||||
}
|
||||
|
||||
if article.Title == "" {
|
||||
logger.Printf("No title found for %s", source.URL)
|
||||
updateSourceStatus(ctx, source.ID, "ERROR_PARSE", "No title extracted", 200)
|
||||
return
|
||||
}
|
||||
|
||||
saved, err := saveArticle(ctx, source, article)
|
||||
if err != nil {
|
||||
logger.Printf("Error saving article: %v", err)
|
||||
updateSourceStatus(ctx, source.ID, "ERROR_DB", err.Error()[:200], 0)
|
||||
return
|
||||
}
|
||||
|
||||
if saved {
|
||||
logger.Printf("Saved: %s", article.Title)
|
||||
updateSourceStatus(ctx, source.ID, "OK", "News created successfully", 200)
|
||||
} else {
|
||||
logger.Printf("Already exists: %s", article.Title)
|
||||
updateSourceStatus(ctx, source.ID, "OK", "News already exists", 200)
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
loadConfig()
|
||||
logger.Println("Starting Scraper Worker")
|
||||
|
||||
cfg := workers.LoadDBConfig()
|
||||
if err := workers.Connect(cfg); err != nil {
|
||||
logger.Fatalf("Failed to connect to database: %v", err)
|
||||
}
|
||||
pool = workers.GetPool()
|
||||
defer workers.Close()
|
||||
|
||||
logger.Println("Connected to PostgreSQL")
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// Handle shutdown
|
||||
sigChan := make(chan os.Signal, 1)
|
||||
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
||||
|
||||
go func() {
|
||||
<-sigChan
|
||||
logger.Println("Shutting down...")
|
||||
os.Exit(0)
|
||||
}()
|
||||
|
||||
logger.Printf("Config: sleep=%ds, batch=%d", sleepInterval, batchSize)
|
||||
|
||||
ticker := time.NewTicker(time.Duration(sleepInterval) * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
sources, err := getActiveURLs(ctx)
|
||||
if err != nil {
|
||||
logger.Printf("Error fetching URLs: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if len(sources) == 0 {
|
||||
logger.Println("No active URLs to process")
|
||||
continue
|
||||
}
|
||||
|
||||
logger.Printf("Processing %d sources", len(sources))
|
||||
|
||||
for _, source := range sources {
|
||||
processSource(ctx, source)
|
||||
time.Sleep(2 * time.Second) // Rate limiting
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue