import logging
from datetime import datetime
from typing import List, Optional
from urllib.parse import urljoin

from playwright.async_api import async_playwright

from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO

logger = logging.getLogger(__name__)


class PlaywrightCrawler(ICrawler):
    """Crawler that renders a page with headless Chromium via Playwright.

    News links are extracted either from a caller-supplied CSS selector
    or, when no selector is given, from ``<h2>`` headings as a simple
    fallback heuristic.
    """

    def __init__(self, url: str, source: str, selector: Optional[str] = None):
        self.url = url  # page to crawl; also the base for resolving relative hrefs
        self.source = source  # label recorded on every emitted item
        self.selector = selector  # optional CSS selector locating news entries

    def _make_item(self, title: str, url: str) -> NewsItemDTO:
        """Build a NewsItemDTO tagged with this crawler's source and the current time."""
        return NewsItemDTO(
            title=title.strip(),
            url=url,
            content_text="",
            source=self.source,
            timestamp=datetime.now(),
        )

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Load the page in headless Chromium and return the news items found.

        Returns:
            A list of NewsItemDTO; empty on any crawling error (best-effort:
            errors are logged, never raised to the caller).

        The browser is always closed, even on failure.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            try:
                try:
                    await page.goto(self.url, wait_until="networkidle", timeout=60000)
                    news_items: List[NewsItemDTO] = []
                    if self.selector:
                        for el in await page.query_selector_all(self.selector):
                            # Use the element itself when it is an anchor;
                            # otherwise look for a nested <a>.
                            if await el.evaluate("node => node.tagName === 'A'"):
                                link_el = el
                            else:
                                link_el = await el.query_selector('a')
                            if not link_el:
                                continue
                            title = await link_el.inner_text()
                            href = await link_el.get_attribute('href')
                            # Skip blank titles too, matching the <h2> fallback
                            # branch (the original emitted empty-title items here).
                            if href and title.strip():
                                # Resolve relative hrefs against the page URL.
                                news_items.append(
                                    self._make_item(title, urljoin(self.url, href))
                                )
                    else:
                        # Fallback heuristic: treat <h2> headings as titles,
                        # pointing each item back at the page itself.
                        for el in await page.query_selector_all('h2'):
                            title = await el.inner_text()
                            if title.strip():
                                news_items.append(self._make_item(title, self.url))
                    return news_items
                except Exception:
                    # Best-effort crawler: log with full traceback (lazy %-args,
                    # not an f-string) and return an empty result.
                    logger.exception("Error crawling %s", self.url)
                    return []
            finally:
                await browser.close()