diff --git a/src/crawlers.yml b/src/crawlers.yml
index ccaf1d4..dd7a08f 100644
--- a/src/crawlers.yml
+++ b/src/crawlers.yml
@@ -50,38 +50,44 @@ crawlers:
     url: "https://blog.google/products-and-platforms/platforms/android/rss/"
     source: "Google Android Blog"
   - type: playwright
-    url: "https://cppconf.ru/"
+    url: "https://cppconf.ru/en/talks/"
     source: "C++ Russia"
-    selector: ".talks h3 a"
+    selector: "div.talk-item"
   - type: playwright
-    url: "https://www.icra2025.org/"
+    url: "https://2025.ieee-icra.org/media/"
     source: "ICRA 2025"
-    selector: "h2 a"
+    selector: "h4"
   - type: playwright
     url: "https://форумтехнопром.рф/"
     source: "Technoprom-2025"
-    selector: ".news-card a"
+    selector: ".news-item"
   - type: playwright
-    url: "https://innoprom.com/en/news/"
+    url: "https://www.innoprom.com/en/media/news/"
     source: "INNOPROM-2025"
-    selector: ".news-item a"
+    selector: ".news-list__item"
   - type: playwright
-    url: "https://www.hannovermesse.de/en/press/press-releases/hannover-messe/"
+    url: "https://www.hannovermesse.de/en/news/news-articles/"
     source: "Hannover Messe"
-    selector: ".media-item a"
+    selector: ".news-card"
   - type: playwright
     url: "https://rscf.ru/en/news/"
     source: "RSF"
-    selector: ".news-item a"
+    selector: ".news-item"
   - type: playwright
     url: "https://sk.ru/news/"
     source: "Skolkovo"
-    selector: ".news-item a"
+    selector: ".news-list-item"
   - type: playwright
     url: "https://research-and-innovation.ec.europa.eu/news_en"
     source: "Horizon Europe"
-    selector: ".ecl-news-item a"
+    selector: ".ecl-news-item"
+  - type: rss
+    url: "https://rb.ru/feeds/all/"
+    source: "RB.ru"
+  - type: rss
+    url: "https://habr.com/ru/rss/all/all/?fl=ru"
+    source: "Habr"
   - type: playwright
     url: "https://t.me/s/addmeto"
-    source: "Addmeto"
+    source: "Telegram: Addmeto"
     selector: ".tgme_widget_message_text"
diff --git a/tests/test_crawlers.py b/tests/test_crawlers.py
new file mode 100644
index 0000000..cf77aa8
--- /dev/null
+++ b/tests/test_crawlers.py
@@ -0,0 +1,127 @@
+import pytest
+import os
+from unittest.mock import MagicMock, AsyncMock, patch
+from src.crawlers.factory import CrawlerFactory
+from src.crawlers.rss_crawler import RSSCrawler
+from src.crawlers.playwright_crawler import PlaywrightCrawler
+from src.crawlers.dto import NewsItemDTO
+
+
+def test_crawler_factory_load_real_file():
+    """The shipped crawlers.yml yields at least one crawler of each type."""
+    # Resolve the config relative to this file so the test does not depend
+    # on the directory pytest is started from.
+    config_path = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+        "src",
+        "crawlers.yml",
+    )
+    assert os.path.exists(config_path)
+
+    crawlers = CrawlerFactory.load_from_yaml(config_path)
+    assert len(crawlers) > 0
+
+    # Both crawler types must be represented.
+    assert any(isinstance(c, RSSCrawler) for c in crawlers)
+    assert any(isinstance(c, PlaywrightCrawler) for c in crawlers)
+
+
+@pytest.mark.asyncio
+async def test_rss_crawler_fetch_latest():
+    """RSSCrawler turns a one-item feed into a single news item."""
+    rss_url = "https://example.com/rss"
+    source = "Test Source"
+    crawler = RSSCrawler(rss_url, source)
+
+    # Minimal RSS 2.0 document containing exactly one <item>.
+    mock_xml = """<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <title>RSS Title</title>
+    <item>
+      <title>Test News</title>
+      <link>https://example.com/news1</link>
+      <description>Test Description</description>
+      <pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
+    </item>
+  </channel>
+</rss>
+"""
+
+    with patch("aiohttp.ClientSession.get") as mock_get:
+        mock_response = AsyncMock()
+        mock_response.status = 200
+        mock_response.text.return_value = mock_xml
+        # raise_for_status() is a plain (non-async) method in aiohttp.
+        mock_response.raise_for_status = MagicMock()
+
+        mock_get.return_value.__aenter__.return_value = mock_response
+
+        items = await crawler.fetch_latest()
+
+    assert len(items) == 1
+    assert items[0].title == "Test News"
+    assert items[0].url == "https://example.com/news1"
+    assert items[0].source == source
+
+
+@pytest.mark.asyncio
+async def test_playwright_crawler_fetch_latest():
+    """PlaywrightCrawler builds an item from a matched element's inner link."""
+    url = "https://example.com/news"
+    source = "Test Playwright"
+    selector = ".news-item"
+    crawler = PlaywrightCrawler(url, source, selector)
+
+    with patch("src.crawlers.playwright_crawler.async_playwright") as mock_playwright:
+        mock_p = AsyncMock()
+        mock_playwright.return_value.__aenter__.return_value = mock_p
+
+        mock_browser = AsyncMock()
+        mock_p.chromium.launch.return_value = mock_browser
+
+        mock_page = AsyncMock()
+        mock_browser.new_page.return_value = mock_page
+
+        mock_element = AsyncMock()
+        mock_element.evaluate.return_value = False  # Not an 'a' tag
+
+        mock_link = AsyncMock()
+        mock_link.inner_text.return_value = "Test News"
+        mock_link.get_attribute.return_value = "/news1"
+
+        mock_element.query_selector.return_value = mock_link
+        mock_page.query_selector_all.return_value = [mock_element]
+
+        items = await crawler.fetch_latest()
+
+    assert len(items) == 1
+    assert items[0].title == "Test News"
+    # The relative href "/news1" must come back resolved against the page URL.
+    assert items[0].url == "https://example.com/news1"
+    assert items[0].source == source
+
+
+def test_crawler_factory_invalid_config(tmp_path):
+    """Unknown types, incomplete entries and non-mapping entries are all skipped."""
+    config_file = tmp_path / "invalid_crawlers.yml"
+    config_file.write_text("""
+crawlers:
+  - type: unknown
+    url: "https://example.com"
+    source: "Unknown"
+  - type: rss
+    url: "https://example.com"
+    # missing source
+  - not_a_dict
+""")
+    crawlers = CrawlerFactory.load_from_yaml(str(config_file))
+    assert len(crawlers) == 0
+
+
+def test_crawler_factory_empty_file(tmp_path):
+    """An empty YAML file produces no crawlers."""
+    config_file = tmp_path / "empty.yml"
+    config_file.write_text("")
+    crawlers = CrawlerFactory.load_from_yaml(str(config_file))
+    assert len(crawlers) == 0