- Added `StaticCrawler` for generic aiohttp+BS4 parsing. - Added `SkolkovoCrawler` for specialized Next.js parsing of sk.ru. - Converted ICRA 2025, RSF, CES 2025, and Telegram Addmeto to `static`. - Converted Horizon Europe to `rss` using its native feed. - Updated `CrawlerFactory` to support new crawler types. - Validated changes with unit tests.
28 lines
1.0 KiB
Python
28 lines
1.0 KiB
Python
import pytest
|
|
import aiohttp
|
|
from src.crawlers.static_crawler import StaticCrawler
|
|
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
|
from src.crawlers.dto import NewsItemDTO
|
|
|
|
@pytest.mark.asyncio
async def test_static_crawler_addmeto():
    """Check that StaticCrawler returns labelled items for the Addmeto page.

    Builds a ``StaticCrawler`` for ``https://t.me/s/addmeto`` with the
    ``.tgme_widget_message_text`` selector, awaits ``fetch_latest()``, and
    verifies that the result is non-empty and that its first item carries
    the source label passed to the crawler.

    NOTE(review): ``fetch_latest()`` presumably performs a live HTTP
    request, which would make this an integration test -- confirm.
    """
    expected_source = "Telegram: Addmeto"
    crawler = StaticCrawler(
        url="https://t.me/s/addmeto",
        source=expected_source,
        selector=".tgme_widget_message_text",
    )

    fetched = await crawler.fetch_latest()

    # Non-empty check comes first so an empty result fails as an assertion
    # rather than as an IndexError on the lookup below.
    assert len(fetched) > 0
    first_item = fetched[0]
    assert first_item.source == expected_source
|
|
@pytest.mark.asyncio
async def test_static_crawler_rsf():
    """Check that StaticCrawler returns labelled, on-site items for RSF news.

    Builds a ``StaticCrawler`` for ``https://rscf.ru/en/news/`` with the
    ``.news-item`` selector, awaits ``fetch_latest()``, and verifies that
    the result is non-empty, that its first item carries the ``"RSF"``
    source label, and that the first item's URL contains ``rscf.ru``.

    NOTE(review): ``fetch_latest()`` presumably performs a live HTTP
    request, which would make this an integration test -- confirm.
    """
    expected_source = "RSF"
    crawler = StaticCrawler(
        url="https://rscf.ru/en/news/",
        source=expected_source,
        selector=".news-item",
    )

    fetched = await crawler.fetch_latest()

    # Non-empty check comes first so an empty result fails as an assertion
    # rather than as an IndexError on the lookup below.
    assert len(fetched) > 0
    first_item = fetched[0]
    assert first_item.source == expected_source
    assert "rscf.ru" in first_item.url
|
|
|
|
@pytest.mark.asyncio
async def test_skolkovo_crawler():
    """Check that SkolkovoCrawler returns labelled, on-site items for sk.ru.

    Builds a ``SkolkovoCrawler`` for ``https://sk.ru/news/``, awaits
    ``fetch_latest()``, and verifies that the result is non-empty, that
    its first item carries the ``"Skolkovo"`` source label, and that the
    first item's URL contains ``sk.ru``.

    NOTE(review): ``fetch_latest()`` presumably performs a live HTTP
    request, which would make this an integration test -- confirm.
    """
    expected_source = "Skolkovo"
    crawler = SkolkovoCrawler(url="https://sk.ru/news/", source=expected_source)

    fetched = await crawler.fetch_latest()

    # Non-empty check comes first so an empty result fails as an assertion
    # rather than as an IndexError on the lookup below.
    assert len(fetched) > 0
    first_item = fetched[0]
    assert first_item.source == expected_source
    assert "sk.ru" in first_item.url
|