feat(crawlers): convert multiple sources from Playwright to Static/RSS

- Added `StaticCrawler` for generic aiohttp+BS4 parsing.
- Added `SkolkovoCrawler` for specialized Next.js parsing of sk.ru.
- Converted ICRA 2025, RSF, CES 2025, and Telegram Addmeto to `static`.
- Converted Horizon Europe to `rss` using its native feed.
- Updated `CrawlerFactory` to support the new crawler types.
- Validated changes with unit tests.
2026-03-15 20:45:57 +03:00 · 2026-03-15 20:45:57 +03:00 · 217037f72e
commit 217037f72e
parent a363ca41cf
5 changed files with 190 additions and 10 deletions
--- a/src/crawlers.yml
+++ b/src/crawlers.yml
@ -12,10 +12,10 @@ crawlers:
    url: "https://cvpr.thecvf.com/Conferences/2025"
    source: "CVPR 2025"
    selector: ".conference-news-item"
-  - type: playwright
+  - type: static
    url: "https://www.ces.tech/discover/?type=Article%2CSuccess+Story%2CPodcast&sort=desc&topics=Artificial+Intelligence%2CContent+and+Entertainment%2CAccessibility%2CInnovation+For+All"
    source: "CES 2025"
-    selector: ".press-release-item"
+    selector: "h3"
  - type: rss
    url: "https://vc.ru/rss/tag/tech"
    source: "VC.ru Tech"
@ -49,7 +49,7 @@ crawlers:
  - type: cppconf
    url: "https://cppconf.ru/en/talks/"
    source: "C++ Russia"
-  - type: playwright
+  - type: static
    url: "https://2025.ieee-icra.org/media/"
    source: "ICRA 2025"
    selector: "h4"
@ -65,25 +65,23 @@ crawlers:
    url: "https://www.hannovermesse.de/en/news/news-articles/"
    source: "Hannover Messe"
    selector: ".news-card"
-  - type: playwright
+  - type: static
    url: "https://rscf.ru/en/news/"
    source: "RSF"
    selector: ".news-item"
-  - type: playwright
+  - type: skolkovo
    url: "https://sk.ru/news/"
    source: "Skolkovo"
-    selector: ".news-list-item"
+  - type: rss
-  - type: playwright
+    url: "https://research-and-innovation.ec.europa.eu/node/2/rss_en"
-    url: "https://research-and-innovation.ec.europa.eu/news_en"
    source: "Horizon Europe"
-    selector: ".ecl-news-item"
  - type: rss
    url: "https://rb.ru/feeds/all/"
    source: "RB.ru"
  - type: rss
    url: "https://habr.com/ru/rss/all/all/?fl=ru"
    source: "Habr"
-  - type: playwright
+  - type: static
    url: "https://t.me/s/addmeto"
    source: "Telegram: Addmeto"
    selector: ".tgme_widget_message_text"
--- a/src/crawlers/factory.py
+++ b/src/crawlers/factory.py
@ -5,6 +5,8 @@ from src.crawlers.base import ICrawler
 from src.crawlers.rss_crawler import RSSCrawler
 from src.crawlers.playwright_crawler import PlaywrightCrawler
 from src.crawlers.cppconf_crawler import CppConfCrawler
 from src.crawlers.static_crawler import StaticCrawler
 from src.crawlers.skolkovo_crawler import SkolkovoCrawler
 logger = logging.getLogger(__name__)
@ -39,6 +41,14 @@ class CrawlerFactory:
                    crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
                elif crawler_type == 'cppconf':
                    crawlers.append(CppConfCrawler(url=url, source=source))
                elif crawler_type == 'static':
                    selector = item.get('selector')
                    if selector:
                        crawlers.append(StaticCrawler(url=url, source=source, selector=selector))
                    else:
                        logger.warning(f"Missing mandatory field 'selector' for static crawler: {item}")
                elif crawler_type == 'skolkovo':
                    crawlers.append(SkolkovoCrawler(url=url, source=source))
                else:
                    logger.warning(f"Unknown crawler type: {crawler_type}")
--- a/src/crawlers/skolkovo_crawler.py
+++ b/src/crawlers/skolkovo_crawler.py
@ -0,0 +1,66 @@
 import json
 import re
 import aiohttp
 from datetime import datetime, timezone
 from typing import List
 from .base import ICrawler
 from .dto import NewsItemDTO
 class SkolkovoCrawler(ICrawler):
    """Crawler for the Skolkovo news page, which is served as a Next.js app.

    Instead of rendering the page in a browser, the crawler downloads the HTML
    and reads the news list from the JSON embedded in the ``__NEXT_DATA__``
    script tag.
    """

    def __init__(self, url: str, source: str = "Skolkovo"):
        """Store the page URL to fetch and the source label put on every item."""
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the page and return the news items found in it.

        Returns:
            The parsed items, or an empty list when the response status is not
            200 or when fetching/parsing fails. Failures are logged rather
            than raised.
        """
        # Function-scope import keeps the class self-contained without
        # touching the module's import block.
        import logging

        logger = logging.getLogger(__name__)
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        logger.warning("Skolkovo crawler: HTTP %s from %s", response.status, self.url)
                        return []
                    html = await response.text()
                    return self.parse_nextjs(html)
            except Exception:
                # Deliberate best-effort boundary: one failing source must not
                # break the caller, but the failure should be visible in logs.
                logger.exception("Skolkovo crawler: failed to fetch %s", self.url)
                return []

    def parse_nextjs(self, html: str) -> List[NewsItemDTO]:
        """Extract news items from the ``__NEXT_DATA__`` JSON embedded in *html*.

        Args:
            html: Full HTML text of the page.

        Returns:
            One ``NewsItemDTO`` per entry of
            ``props.pageProps.initialProps.homeStore.news.items``. An empty
            list is returned when the script tag is missing, the JSON is
            invalid, or the payload does not have the expected structure.
        """
        # Tolerate extra attributes on the tag (e.g. a nonce) and payloads
        # that span several lines.
        match = re.search(
            r'<script\b[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>',
            html,
            re.DOTALL,
        )
        if not match:
            return []

        try:
            data = json.loads(match.group(1))
            news_data = data["props"]["pageProps"]["initialProps"]["homeStore"]["news"]
            # "items" may be absent or null; both mean "no news".
            items_list = news_data.get("items") or []
        except (KeyError, TypeError, AttributeError, json.JSONDecodeError):
            return []
        if not isinstance(items_list, list):
            return []

        news_items = []
        for item in items_list:
            if not isinstance(item, dict):
                continue

            # JSON null values arrive as None, so fall back to "" explicitly.
            title = item.get("title") or ""
            # The slug identifies the article; without it link to the listing page.
            slug = item.get("slug") or ""
            url = f"https://sk.ru/news/{slug}/" if slug else self.url

            # Strip simple HTML tags and collapse whitespace in the description.
            description = str(item.get("description") or "")
            content_text = ' '.join(re.sub(r'<[^>]+>', ' ', description).split())

            timestamp = self._parse_timestamp(item.get("published_at"), item.get("created_at"))

            news_items.append(NewsItemDTO(
                title=title,
                url=url,
                content_text=content_text,
                source=self.source,
                timestamp=timestamp
            ))
        return news_items

    @staticmethod
    def _parse_timestamp(*candidates: object) -> datetime:
        """Return the first candidate that parses as ISO 8601, as an aware datetime.

        Values without a UTC offset are treated as UTC so every item carries a
        timezone-aware timestamp. When no candidate parses, the current UTC
        time is returned.
        """
        for raw in candidates:
            if not isinstance(raw, str) or not raw:
                continue
            try:
                # Older fromisoformat() versions reject a trailing "Z".
                parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
            except ValueError:
                continue
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed
        return datetime.now(timezone.utc)
--- a/src/crawlers/static_crawler.py
+++ b/src/crawlers/static_crawler.py
@ -0,0 +1,79 @@
 import asyncio
 import aiohttp
 import re
 from typing import List
 from datetime import datetime, timezone
 from bs4 import BeautifulSoup
 from .base import ICrawler
 from .dto import NewsItemDTO
 class StaticCrawler(ICrawler):
    """Generic crawler for server-rendered pages, using aiohttp and BeautifulSoup.

    Every element matching the CSS ``selector`` is treated as one news entry;
    its title and URL are taken from a link inside that element.
    """

    def __init__(self, url: str, source: str, selector: str):
        """Store the page URL, the source label and the CSS selector for entries."""
        self.url = url
        self.source = source
        self.selector = selector

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the page and return the news items found in it.

        Returns:
            The parsed items, or an empty list when the response status is not
            200 or when fetching/parsing fails. Failures are logged rather
            than raised.
        """
        # Function-scope import keeps the class self-contained without
        # touching the module's import block.
        import logging

        logger = logging.getLogger(__name__)
        # Browser-like User-Agent sent with every request of this session.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        logger.warning("Static crawler (%s): HTTP %s from %s", self.source, response.status, self.url)
                        return []
                    html = await response.text()
                    return self.parse_html(html)
            except Exception:
                # Deliberate best-effort boundary: one failing source must not
                # break the caller, but the failure should be visible in logs.
                logger.exception("Static crawler (%s): failed to fetch %s", self.source, self.url)
                return []

    def parse_html(self, html: str) -> List[NewsItemDTO]:
        """Build news items from the elements of *html* matching ``self.selector``.

        For each matched element the first link with non-empty text supplies
        both title and URL. If no link has text, the first link supplies the
        URL and the first heading or paragraph inside the element supplies the
        title. Elements without a usable link or title are skipped.

        Args:
            html: Full HTML text of the page.

        Returns:
            One ``NewsItemDTO`` per usable element, with the URL resolved
            against ``self.url`` and the timestamp set to the current UTC time.
        """
        from urllib.parse import urljoin

        soup = BeautifulSoup(html, "html.parser")
        items = []
        for el in soup.select(self.selector):
            # Only anchors that actually carry an href can provide a URL.
            anchors = el.find_all('a', href=True)
            if not anchors:
                continue

            link_el = None
            title = ""
            # Prefer the first link that has visible text.
            for a in anchors:
                txt = a.get_text(strip=True)
                if txt:
                    title = txt
                    link_el = a
                    break

            # No link with text: use the first link and look for a title elsewhere.
            if link_el is None:
                link_el = anchors[0]
                title_el = el.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])
                if title_el:
                    title = title_el.get_text(strip=True)

            href = (link_el.get('href') or "").strip()
            if not title or not href:
                continue

            # urljoin resolves every relative form ("/a", "a/b", "../a", "?q=1")
            # and returns absolute URLs unchanged.
            url = urljoin(self.url, href)

            content_text = el.get_text(separator=" ", strip=True)
            items.append(NewsItemDTO(
                title=title,
                url=url,
                content_text=content_text,
                source=self.source,
                timestamp=datetime.now(timezone.utc)
            ))
        return items
--- a/tests/crawlers/test_new_crawlers.py
+++ b/tests/crawlers/test_new_crawlers.py
@ -0,0 +1,27 @@
 import pytest
 import aiohttp
 from src.crawlers.static_crawler import StaticCrawler
 from src.crawlers.skolkovo_crawler import SkolkovoCrawler
 from src.crawlers.dto import NewsItemDTO
@pytest.mark.asyncio
async def test_static_crawler_addmeto():
    """Live check: StaticCrawler returns items from the Addmeto Telegram preview page.

    The crawler performs a real HTTP request, so this test needs network access.
    """
    expected_source = "Telegram: Addmeto"
    addmeto_crawler = StaticCrawler(
        url="https://t.me/s/addmeto",
        source=expected_source,
        selector=".tgme_widget_message_text",
    )

    fetched = await addmeto_crawler.fetch_latest()

    assert len(fetched) > 0
    first_item = fetched[0]
    assert first_item.source == expected_source
@pytest.mark.asyncio
async def test_static_crawler_rsf():
    """Live check: StaticCrawler returns RSF news items with rscf.ru URLs.

    The crawler performs a real HTTP request, so this test needs network access.
    """
    expected_source = "RSF"
    rsf_crawler = StaticCrawler(
        url="https://rscf.ru/en/news/",
        source=expected_source,
        selector=".news-item",
    )

    fetched = await rsf_crawler.fetch_latest()

    assert len(fetched) > 0
    first_item = fetched[0]
    assert first_item.source == expected_source
    assert "rscf.ru" in first_item.url
@pytest.mark.asyncio
async def test_skolkovo_crawler():
    """Live check: SkolkovoCrawler returns news items with sk.ru URLs.

    The crawler performs a real HTTP request, so this test needs network access.
    """
    expected_source = "Skolkovo"
    skolkovo_crawler = SkolkovoCrawler(url="https://sk.ru/news/", source=expected_source)

    fetched = await skolkovo_crawler.fetch_latest()

    assert len(fetched) > 0
    first_item = fetched[0]
    assert first_item.source == expected_source
    assert "sk.ru" in first_item.url