AI-Trend-Scout/src/crawlers/playwright_crawler.py
Artur Mukhamadiev 9c31977e98 [feat] playwright crawler
:Release Notes:
-

:Detailed Notes:
-

:Testing Performed:
-

:QA Notes:
as always AI generated

:Issues Addressed:
-
2026-03-14 20:13:53 +03:00

74 lines
3.2 KiB
Python

import logging
from typing import List, Optional
from playwright.async_api import async_playwright
from datetime import datetime
from urllib.parse import urljoin
from src.crawlers.base import ICrawler
from src.crawlers.dto import NewsItemDTO
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class PlaywrightCrawler(ICrawler):
    """Crawler that renders a page in headless Chromium and extracts news items.

    With a CSS ``selector`` configured, every matching element contributes one
    item built from its link: the element itself when it is an ``<a>``,
    otherwise its first descendant ``<a>``. Without a selector, every
    non-empty ``<h2>`` on the page becomes an item that points at the page URL.
    """

    def __init__(self, url: str, source: str, selector: Optional[str] = None):
        """Store the crawl target.

        Args:
            url: Page to load; also the base used to resolve relative hrefs.
            source: Value copied into the ``source`` field of every item.
            selector: Optional CSS selector for the elements to inspect.
        """
        self.url = url
        self.source = source
        self.selector = selector

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Load ``self.url`` in a headless browser and extract news items.

        Returns:
            The extracted items. An empty list is returned when opening the
            page, navigating, or extracting raises; the error is logged with
            its traceback. A failure to launch the browser itself propagates.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                # Page creation is inside the try so that a failure here
                # still closes the browser and follows the "return []" path.
                page = await browser.new_page()
                await page.goto(self.url, wait_until="networkidle", timeout=60000)
                if self.selector:
                    return await self._extract_from_selector(page)
                return await self._extract_from_headings(page)
            except Exception as e:
                # Best-effort crawler: log (with traceback) and report nothing.
                logger.exception("Error crawling %s: %s", self.url, e)
                return []
            finally:
                await browser.close()

    async def _extract_from_selector(self, page) -> List[NewsItemDTO]:
        """Build one item per ``self.selector`` match that has a titled link."""
        items: List[NewsItemDTO] = []
        for el in await page.query_selector_all(self.selector):
            # The matched element may itself be the anchor; otherwise fall
            # back to the first anchor nested inside it.
            is_anchor = await el.evaluate("node => node.tagName === 'A'")
            link_el = el if is_anchor else await el.query_selector('a')
            if not link_el:
                continue
            title = (await link_el.inner_text()).strip()
            href = await link_el.get_attribute('href')
            # Skip links without text or target, mirroring the empty-title
            # filter applied by the <h2> fallback.
            if not title or not href:
                continue
            items.append(self._make_item(title, urljoin(self.url, href)))
        return items

    async def _extract_from_headings(self, page) -> List[NewsItemDTO]:
        """Fallback heuristic: turn every non-empty ``<h2>`` into an item."""
        items: List[NewsItemDTO] = []
        for el in await page.query_selector_all('h2'):
            title = (await el.inner_text()).strip()
            if title:
                # Headings carry no link of their own, so use the page URL.
                items.append(self._make_item(title, self.url))
        return items

    def _make_item(self, title: str, url: str) -> NewsItemDTO:
        """Create a DTO with empty content text, stamped with the current local time."""
        return NewsItemDTO(
            title=title,
            url=url,
            content_text="",
            source=self.source,
            timestamp=datetime.now(),
        )