[feat] playwright crawler
:Release Notes: -
:Detailed Notes: -
:Testing Performed: -
:QA Notes: as always AI generated
:Issues Addressed: -
This commit is contained in:
parent
4bf7cb4331
commit
9c31977e98
73
src/crawlers/playwright_crawler.py
Normal file
73
src/crawlers/playwright_crawler.py
Normal file
@ -0,0 +1,73 @@
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
from playwright.async_api import async_playwright
|
||||
from datetime import datetime
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from src.crawlers.base import ICrawler
|
||||
from src.crawlers.dto import NewsItemDTO
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PlaywrightCrawler(ICrawler):
    """Crawler that renders JavaScript-heavy pages in headless Chromium
    (via Playwright) and extracts news items from the resulting DOM.

    When a CSS ``selector`` is given, each matching element is searched for
    a link (``<a>``) to supply the title and URL. Without a selector, every
    non-empty ``<h2>`` heading is used as a fallback heuristic and the page
    URL itself is attached to each item.
    """

    def __init__(self, url: str, source: str, selector: Optional[str] = None):
        # url: page to crawl.
        # source: label stamped on every produced NewsItemDTO.
        # selector: optional CSS selector for news-item container elements.
        self.url = url
        self.source = source
        self.selector = selector

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch the page and return the extracted news items.

        Best-effort contract: any failure is logged and an empty list is
        returned, so callers never handle crawler exceptions. The browser
        is always closed, even on error.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                # new_page() is inside the try so a failure here still
                # closes the browser (the original leaked it).
                page = await browser.new_page()
                # "networkidle" waits for JS-driven content to settle.
                await page.goto(self.url, wait_until="networkidle", timeout=60000)
                if self.selector:
                    return await self._extract_from_selector(page)
                return await self._extract_from_headings(page)
            except Exception as e:
                # Lazy %s args: formatting only happens if the record is emitted.
                logger.error("Error crawling %s: %s", self.url, e)
                return []
            finally:
                await browser.close()

    async def _extract_from_selector(self, page) -> List[NewsItemDTO]:
        """Build items from elements matching ``self.selector``."""
        news_items: List[NewsItemDTO] = []
        for el in await page.query_selector_all(self.selector):
            # Use the element itself when it is an <a>; otherwise look
            # for a nested link inside it.
            if await el.evaluate("node => node.tagName === 'A'"):
                link_el = el
            else:
                link_el = await el.query_selector('a')
            if not link_el:
                continue
            title = await link_el.inner_text()
            href = await link_el.get_attribute('href')
            if href:
                news_items.append(
                    NewsItemDTO(
                        title=title.strip(),
                        # Resolve relative hrefs against the page URL.
                        url=urljoin(self.url, href),
                        content_text="",
                        source=self.source,
                        timestamp=datetime.now(),
                    )
                )
        return news_items

    async def _extract_from_headings(self, page) -> List[NewsItemDTO]:
        """Fallback heuristic: treat every non-empty <h2> as a headline."""
        news_items: List[NewsItemDTO] = []
        for el in await page.query_selector_all('h2'):
            title = await el.inner_text()
            if title.strip():
                news_items.append(
                    NewsItemDTO(
                        title=title.strip(),
                        # No per-item link is available on this path.
                        url=self.url,
                        content_text="",
                        source=self.source,
                        timestamp=datetime.now(),
                    )
                )
        return news_items
|
||||
99
tests/crawlers/test_playwright_crawler.py
Normal file
99
tests/crawlers/test_playwright_crawler.py
Normal file
@ -0,0 +1,99 @@
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, patch, MagicMock
|
||||
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
||||
from src.crawlers.dto import NewsItemDTO
|
||||
|
||||
@pytest.mark.asyncio
async def test_playwright_crawler_fetch_latest_with_selector():
    """Selector path: a nested <a> supplies the title and a resolved URL."""
    url = "https://example.com/news"
    source = "ExampleSource"
    crawler = PlaywrightCrawler(url, source, ".news-item")

    with patch("src.crawlers.playwright_crawler.async_playwright") as pw:
        # Wire up the async chain: async_playwright() -> playwright
        # -> browser -> page.
        playwright = AsyncMock()
        pw.return_value.__aenter__.return_value = playwright

        browser = AsyncMock()
        playwright.chromium.launch.return_value = browser

        page = AsyncMock()
        browser.new_page.return_value = page

        # One container element: not an <a> itself, but contains one.
        container = AsyncMock()
        container.evaluate.return_value = False

        anchor = AsyncMock()
        anchor.inner_text.return_value = "Test News Title"
        anchor.get_attribute.return_value = "/news/1"

        container.query_selector.return_value = anchor
        page.query_selector_all.return_value = [container]

        items = await crawler.fetch_latest()

    assert len(items) == 1
    item = items[0]
    assert item.title == "Test News Title"
    # Relative href must be joined against the page URL.
    assert item.url == "https://example.com/news/1"
    assert item.source == source

    page.goto.assert_called_once_with(url, wait_until="networkidle", timeout=60000)
    browser.close.assert_called_once()
|
||||
|
||||
@pytest.mark.asyncio
async def test_playwright_crawler_fetch_latest_no_selector():
    """Fallback path: without a selector, <h2> text becomes the title and
    the page URL is attached to the item."""
    url = "https://example.com/blog"
    source = "ExampleBlog"
    crawler = PlaywrightCrawler(url, source)

    with patch("src.crawlers.playwright_crawler.async_playwright") as pw:
        playwright = AsyncMock()
        pw.return_value.__aenter__.return_value = playwright

        browser = AsyncMock()
        playwright.chromium.launch.return_value = browser

        page = AsyncMock()
        browser.new_page.return_value = page

        # A single <h2> heading for the fallback heuristic.
        heading = AsyncMock()
        heading.inner_text.return_value = "Headline Title"
        page.query_selector_all.return_value = [heading]

        items = await crawler.fetch_latest()

    assert len(items) == 1
    item = items[0]
    assert item.title == "Headline Title"
    assert item.url == url
    assert item.source == source
|
||||
|
||||
@pytest.mark.asyncio
async def test_playwright_crawler_fetch_latest_error():
    """Errors during navigation yield [] and still close the browser."""
    url = "https://example.com/error"
    source = "ErrorSource"
    crawler = PlaywrightCrawler(url, source)

    with patch("src.crawlers.playwright_crawler.async_playwright") as pw:
        playwright = AsyncMock()
        pw.return_value.__aenter__.return_value = playwright

        browser = AsyncMock()
        playwright.chromium.launch.return_value = browser

        page = AsyncMock()
        browser.new_page.return_value = page

        # Make navigation blow up to exercise the error path.
        page.goto.side_effect = Exception("Crawl failed")

        items = await crawler.fetch_latest()

    assert items == []
    browser.close.assert_called_once()
|
||||
Loading…
x
Reference in New Issue
Block a user