From 9c31977e9897dfcb3ad73761603fe533c5f86e66 Mon Sep 17 00:00:00 2001
From: Artur Mukhamadiev
Date: Sat, 14 Mar 2026 20:13:53 +0300
Subject: [PATCH] [feat] playwright crawler

:Release Notes:
-

:Detailed Notes:
-

:Testing Performed:
-

:QA Notes:
as always AI generated

:Issues Addressed:
-

---
 src/crawlers/playwright_crawler.py        | 75 ++++++++++++++++++
 tests/crawlers/test_playwright_crawler.py | 99 +++++++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 src/crawlers/playwright_crawler.py
 create mode 100644 tests/crawlers/test_playwright_crawler.py

diff --git a/src/crawlers/playwright_crawler.py b/src/crawlers/playwright_crawler.py
new file mode 100644
index 0000000..ff25772
--- /dev/null
+++ b/src/crawlers/playwright_crawler.py
@@ -0,0 +1,75 @@
+import logging
+from typing import List, Optional
+from playwright.async_api import async_playwright
+from datetime import datetime
+from urllib.parse import urljoin
+
+from src.crawlers.base import ICrawler
+from src.crawlers.dto import NewsItemDTO
+
+logger = logging.getLogger(__name__)
+
+class PlaywrightCrawler(ICrawler):
+    """Crawler that renders a page with headless Chromium via Playwright."""
+
+    def __init__(self, url: str, source: str, selector: Optional[str] = None):
+        self.url = url
+        self.source = source
+        self.selector = selector
+
+    async def fetch_latest(self) -> List[NewsItemDTO]:
+        """Fetch the page and extract news items; returns [] on any error."""
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            page = await browser.new_page()
+            try:
+                await page.goto(self.url, wait_until="networkidle", timeout=60000)
+
+                news_items = []
+
+                if self.selector:
+                    elements = await page.query_selector_all(self.selector)
+                    for el in elements:
+                        # Try to find a link and title within the element.
+                        # If the element itself is an 'a' tag, use it directly.
+                        if await el.evaluate("node => node.tagName === 'A'"):
+                            link_el = el
+                        else:
+                            link_el = await el.query_selector('a')
+
+                        if link_el:
+                            title = await link_el.inner_text()
+                            href = await link_el.get_attribute('href')
+                            if href:
+                                full_url = urljoin(self.url, href)
+                                news_items.append(
+                                    NewsItemDTO(
+                                        title=title.strip(),
+                                        url=full_url,
+                                        content_text="",
+                                        source=self.source,
+                                        timestamp=datetime.now()
+                                    )
+                                )
+                else:
+                    # Fallback: extract h2 titles as a simple heuristic
+                    elements = await page.query_selector_all('h2')
+                    for el in elements:
+                        title = await el.inner_text()
+                        if title.strip():
+                            news_items.append(
+                                NewsItemDTO(
+                                    title=title.strip(),
+                                    url=self.url,
+                                    content_text="",
+                                    source=self.source,
+                                    timestamp=datetime.now()
+                                )
+                            )
+
+                return news_items
+            except Exception as e:
+                logger.error(f"Error crawling {self.url}: {e}")
+                return []
+            finally:
+                await browser.close()
diff --git a/tests/crawlers/test_playwright_crawler.py b/tests/crawlers/test_playwright_crawler.py
new file mode 100644
index 0000000..9a92a47
--- /dev/null
+++ b/tests/crawlers/test_playwright_crawler.py
@@ -0,0 +1,99 @@
+import pytest
+from unittest.mock import AsyncMock, patch, MagicMock
+from src.crawlers.playwright_crawler import PlaywrightCrawler
+from src.crawlers.dto import NewsItemDTO
+
+@pytest.mark.asyncio
+async def test_playwright_crawler_fetch_latest_with_selector():
+    url = "https://example.com/news"
+    source = "ExampleSource"
+    selector = ".news-item"
+
+    crawler = PlaywrightCrawler(url, source, selector)
+
+    with patch("src.crawlers.playwright_crawler.async_playwright") as mock_playwright:
+        # Mocking the async context manager chain
+        mock_p = AsyncMock()
+        mock_playwright.return_value.__aenter__.return_value = mock_p
+
+        mock_browser = AsyncMock()
+        mock_p.chromium.launch.return_value = mock_browser
+
+        mock_page = AsyncMock()
+        mock_browser.new_page.return_value = mock_page
+
+        # Setup mock elements
+        mock_element = AsyncMock()
+        mock_element.evaluate.return_value = False  # Assume it's not an 'a' tag itself
+
+        mock_link = AsyncMock()
+        mock_link.inner_text.return_value = "Test News Title"
+        mock_link.get_attribute.return_value = "/news/1"
+
+        mock_element.query_selector.return_value = mock_link
+        mock_page.query_selector_all.return_value = [mock_element]
+
+        results = await crawler.fetch_latest()
+
+        assert len(results) == 1
+        assert results[0].title == "Test News Title"
+        assert results[0].url == "https://example.com/news/1"
+        assert results[0].source == source
+
+        mock_page.goto.assert_called_once_with(url, wait_until="networkidle", timeout=60000)
+        mock_browser.close.assert_called_once()
+
+@pytest.mark.asyncio
+async def test_playwright_crawler_fetch_latest_no_selector():
+    url = "https://example.com/blog"
+    source = "ExampleBlog"
+
+    crawler = PlaywrightCrawler(url, source)
+
+    with patch("src.crawlers.playwright_crawler.async_playwright") as mock_playwright:
+        mock_p = AsyncMock()
+        mock_playwright.return_value.__aenter__.return_value = mock_p
+
+        mock_browser = AsyncMock()
+        mock_p.chromium.launch.return_value = mock_browser
+
+        mock_page = AsyncMock()
+        mock_browser.new_page.return_value = mock_page
+
+        # Setup mock elements for fallback (h2)
+        mock_h2 = AsyncMock()
+        mock_h2.inner_text.return_value = "Headline Title"
+
+        mock_page.query_selector_all.return_value = [mock_h2]
+
+        results = await crawler.fetch_latest()
+
+        assert len(results) == 1
+        assert results[0].title == "Headline Title"
+        assert results[0].url == url
+        assert results[0].source == source
+
+@pytest.mark.asyncio
+async def test_playwright_crawler_fetch_latest_error():
+    url = "https://example.com/error"
+    source = "ErrorSource"
+
+    crawler = PlaywrightCrawler(url, source)
+
+    with patch("src.crawlers.playwright_crawler.async_playwright") as mock_playwright:
+        mock_p = AsyncMock()
+        mock_playwright.return_value.__aenter__.return_value = mock_p
+
+        mock_browser = AsyncMock()
+        mock_p.chromium.launch.return_value = mock_browser
+
+        mock_page = AsyncMock()
+        mock_browser.new_page.return_value = mock_page
+
+        # Simulate an error in page.goto
+        mock_page.goto.side_effect = Exception("Crawl failed")
+
+        results = await crawler.fetch_latest()
+
+        assert results == []
+        mock_browser.close.assert_called_once()