:Release Notes: - :Detailed Notes: - :Testing Performed: - :QA Notes: AI-generated, as always. :Issues Addressed: -
74 lines
3.2 KiB
Python
74 lines
3.2 KiB
Python
import logging
|
|
from typing import List, Optional
|
|
from playwright.async_api import async_playwright
|
|
from datetime import datetime
|
|
from urllib.parse import urljoin
|
|
|
|
from src.crawlers.base import ICrawler
|
|
from src.crawlers.dto import NewsItemDTO
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class PlaywrightCrawler(ICrawler):
    """Crawler that renders a page in headless Chromium (Playwright) and
    extracts news links.

    When a CSS ``selector`` is provided, each matched element (or the first
    ``<a>`` inside it) yields one news item. Without a selector, every
    non-empty ``<h2>`` heading on the page is used as a fallback heuristic,
    with the page URL itself as the item URL.
    """

    def __init__(self, url: str, source: str, selector: Optional[str] = None):
        """Store crawl target and extraction configuration.

        Args:
            url: Page to crawl; also the base for resolving relative hrefs.
            source: Source name stamped on every produced ``NewsItemDTO``.
            selector: Optional CSS selector locating news entries.
        """
        self.url = url
        self.source = source
        self.selector = selector

    def _make_item(self, title: str, url: str) -> NewsItemDTO:
        """Build a DTO with this crawler's source and the current time.

        Centralizes the construction that was previously duplicated in both
        extraction branches.
        """
        # NOTE(review): datetime.now() is naive (no tzinfo) — confirm whether
        # downstream consumers expect UTC-aware timestamps.
        return NewsItemDTO(
            title=title.strip(),
            url=url,
            content_text="",
            source=self.source,
            timestamp=datetime.now(),
        )

    async def _extract_with_selector(self, page) -> List[NewsItemDTO]:
        """Extract items from elements matching ``self.selector``."""
        items: List[NewsItemDTO] = []
        for el in await page.query_selector_all(self.selector):
            # The matched element may itself be the anchor, or contain one.
            if await el.evaluate("node => node.tagName === 'A'"):
                link_el = el
            else:
                link_el = await el.query_selector('a')
            if not link_el:
                continue
            title = await link_el.inner_text()
            href = await link_el.get_attribute('href')
            if href:
                # Resolve relative hrefs against the crawled page URL.
                items.append(self._make_item(title, urljoin(self.url, href)))
        return items

    async def _extract_headings(self, page) -> List[NewsItemDTO]:
        """Fallback: treat each non-empty <h2> text as an item for the page URL."""
        items: List[NewsItemDTO] = []
        for el in await page.query_selector_all('h2'):
            title = await el.inner_text()
            if title.strip():
                items.append(self._make_item(title, self.url))
        return items

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Load the page in headless Chromium and return extracted news items.

        Returns:
            Extracted items, or an empty list if crawling fails (the error
            is logged). The browser is closed on every path.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            # A single try/except/finally replaces the original redundant
            # nested try blocks: same semantics (errors logged and swallowed,
            # browser always closed), flatter control flow.
            try:
                await page.goto(self.url, wait_until="networkidle", timeout=60000)
                if self.selector:
                    return await self._extract_with_selector(page)
                return await self._extract_headings(page)
            except Exception:
                # Boundary handler: any crawl failure yields an empty result.
                # logger.exception records the traceback; lazy %-args avoid
                # eager f-string formatting.
                logger.exception("Error crawling %s", self.url)
                return []
            finally:
                await browser.close()