Update crawler selectors and add comprehensive tests
This commit is contained in:
parent
87af585e1b
commit
019d9161de
@ -50,38 +50,44 @@ crawlers:
|
|||||||
url: "https://blog.google/products-and-platforms/platforms/android/rss/"
|
url: "https://blog.google/products-and-platforms/platforms/android/rss/"
|
||||||
source: "Google Android Blog"
|
source: "Google Android Blog"
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://cppconf.ru/"
|
url: "https://cppconf.ru/en/talks/"
|
||||||
source: "C++ Russia"
|
source: "C++ Russia"
|
||||||
selector: ".talks h3 a"
|
selector: "div.talk-item"
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://www.icra2025.org/"
|
url: "https://2025.ieee-icra.org/media/"
|
||||||
source: "ICRA 2025"
|
source: "ICRA 2025"
|
||||||
selector: "h2 a"
|
selector: "h4"
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://форумтехнопром.рф/"
|
url: "https://форумтехнопром.рф/"
|
||||||
source: "Technoprom-2025"
|
source: "Technoprom-2025"
|
||||||
selector: ".news-card a"
|
selector: ".news-item"
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://innoprom.com/en/news/"
|
url: "https://www.innoprom.com/en/media/news/"
|
||||||
source: "INNOPROM-2025"
|
source: "INNOPROM-2025"
|
||||||
selector: ".news-item a"
|
selector: ".news-list__item"
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://www.hannovermesse.de/en/press/press-releases/hannover-messe/"
|
url: "https://www.hannovermesse.de/en/news/news-articles/"
|
||||||
source: "Hannover Messe"
|
source: "Hannover Messe"
|
||||||
selector: ".media-item a"
|
selector: ".news-card"
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://rscf.ru/en/news/"
|
url: "https://rscf.ru/en/news/"
|
||||||
source: "RSF"
|
source: "RSF"
|
||||||
selector: ".news-item a"
|
selector: ".news-item"
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://sk.ru/news/"
|
url: "https://sk.ru/news/"
|
||||||
source: "Skolkovo"
|
source: "Skolkovo"
|
||||||
selector: ".news-item a"
|
selector: ".news-list-item"
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://research-and-innovation.ec.europa.eu/news_en"
|
url: "https://research-and-innovation.ec.europa.eu/news_en"
|
||||||
source: "Horizon Europe"
|
source: "Horizon Europe"
|
||||||
selector: ".ecl-news-item a"
|
selector: ".ecl-news-item"
|
||||||
|
- type: rss
|
||||||
|
url: "https://rb.ru/feeds/all/"
|
||||||
|
source: "RB.ru"
|
||||||
|
- type: rss
|
||||||
|
url: "https://habr.com/ru/rss/all/all/?fl=ru"
|
||||||
|
source: "Habr"
|
||||||
- type: playwright
|
- type: playwright
|
||||||
url: "https://t.me/s/addmeto"
|
url: "https://t.me/s/addmeto"
|
||||||
source: "Addmeto"
|
source: "Telegram: Addmeto"
|
||||||
selector: ".tgme_widget_message_text"
|
selector: ".tgme_widget_message_text"
|
||||||
|
|||||||
109
tests/test_crawlers.py
Normal file
109
tests/test_crawlers.py
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
import pytest
|
||||||
|
import os
|
||||||
|
from unittest.mock import MagicMock, AsyncMock, patch
|
||||||
|
from src.crawlers.factory import CrawlerFactory
|
||||||
|
from src.crawlers.rss_crawler import RSSCrawler
|
||||||
|
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
def test_crawler_factory_load_real_file():
    """Load the project's real ``src/crawlers.yml`` and check it yields crawlers.

    The config path is resolved relative to this test file instead of the
    current working directory, so the test does not depend on where pytest
    is invoked from.
    """
    # Assumes this module lives in <repo>/tests/, i.e. the repository root is
    # one directory above the directory containing this file.
    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    config_path = os.path.join(repo_root, "src", "crawlers.yml")

    # Fail with a readable message if the config has been moved or removed.
    assert os.path.isfile(config_path), f"missing crawler config: {config_path}"

    crawlers = CrawlerFactory.load_from_yaml(config_path)
    assert len(crawlers) > 0

    # The loaded config must produce at least one crawler of each kind.
    assert any(isinstance(c, RSSCrawler) for c in crawlers)
    assert any(isinstance(c, PlaywrightCrawler) for c in crawlers)
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_rss_crawler_fetch_latest():
    """Check that ``RSSCrawler.fetch_latest`` turns a one-item feed into one item.

    ``aiohttp.ClientSession.get`` is patched so no network request is made;
    the patched call yields a canned RSS 2.0 document with a single entry.
    """
    source = "Test Source"
    crawler = RSSCrawler("https://example.com/rss", source)

    feed_xml = """<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title>RSS Title</title>
<item>
<title>Test News</title>
<link>https://example.com/news1</link>
<description>Test Description</description>
<pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
</item>
</channel>
</rss>
"""

    # Fake HTTP response: awaiting ``text()`` gives the feed, while
    # ``raise_for_status`` is a plain (synchronous) no-op.
    fake_response = AsyncMock(
        status=200,
        raise_for_status=MagicMock(),
        **{"text.return_value": feed_xml},
    )

    with patch("aiohttp.ClientSession.get") as patched_get:
        # ``async with session.get(...) as resp`` resolves to the fake response.
        patched_get.return_value.__aenter__.return_value = fake_response
        items = await crawler.fetch_latest()

    assert len(items) == 1
    first = items[0]
    assert first.title == "Test News"
    assert first.url == "https://example.com/news1"
    assert first.source == source
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_playwright_crawler_fetch_latest():
    """Check ``PlaywrightCrawler.fetch_latest`` against a fully mocked browser.

    The page returns one matched element that is not itself a link; the link
    found inside it carries the title and a relative href, and the test
    expects that href to come back as an absolute URL on the crawled site.
    """
    source = "Test Playwright"
    crawler = PlaywrightCrawler("https://example.com/news", source, ".news-item")

    # Link nested inside the matched element: provides the title and href.
    inner_link = AsyncMock(
        **{
            "inner_text.return_value": "Test News",
            "get_attribute.return_value": "/news1",
        }
    )

    # Element matched by the selector.
    matched = AsyncMock()
    matched.evaluate.return_value = False  # Not an 'a' tag
    matched.query_selector.return_value = inner_link

    # Wire the chain page -> browser -> playwright handle, bottom-up.
    fake_page = AsyncMock()
    fake_page.query_selector_all.return_value = [matched]

    fake_browser = AsyncMock()
    fake_browser.new_page.return_value = fake_page

    fake_pw = AsyncMock()
    fake_pw.chromium.launch.return_value = fake_browser

    with patch("src.crawlers.playwright_crawler.async_playwright") as patched_pw:
        # ``async with async_playwright() as p`` resolves to the fake handle.
        patched_pw.return_value.__aenter__.return_value = fake_pw
        items = await crawler.fetch_latest()

    assert len(items) == 1
    first = items[0]
    assert first.title == "Test News"
    assert first.url == "https://example.com/news1"
    assert first.source == source
|
||||||
|
|
||||||
|
def test_crawler_factory_invalid_config(tmp_path):
    """Check that a config made only of invalid entries yields no crawlers.

    The file contains three bad entries: an unknown crawler type, an ``rss``
    entry without ``source``, and a bare scalar instead of a mapping.
    """
    bad_config = tmp_path / "invalid_crawlers.yml"
    bad_config.write_text("""
crawlers:
  - type: unknown
    url: "https://example.com"
    source: "Unknown"
  - type: rss
    url: "https://example.com"
    # missing source
  - not_a_dict
""")

    loaded = CrawlerFactory.load_from_yaml(str(bad_config))

    assert len(loaded) == 0
|
||||||
|
|
||||||
|
def test_crawler_factory_empty_file(tmp_path):
    """Check that loading a zero-byte YAML file yields no crawlers."""
    empty_config = tmp_path / "empty.yml"
    empty_config.write_text("")

    loaded = CrawlerFactory.load_from_yaml(str(empty_config))

    assert len(loaded) == 0
|
||||||
Loading…
x
Reference in New Issue
Block a user