feat(crawlers): convert multiple sources from Playwright to Static/RSS

- Added `StaticCrawler` for generic aiohttp+BS4 parsing.
- Added `SkolkovoCrawler` for specialized Next.js parsing of sk.ru.
- Converted ICRA 2025, RSF, CES 2025, and Telegram Addmeto to `static`.
- Converted Horizon Europe to `rss` using its native feed.
- Updated `CrawlerFactory` to support new crawler types.
- Validated changes with unit tests.
2026-03-15 20:45:57 +03:00 · 2026-03-15 20:45:57 +03:00 · 217037f72e
commit 217037f72e
parent a363ca41cf
5 changed files with 190 additions and 10 deletions
--- a/src/crawlers.yml
+++ b/src/crawlers.yml
@ -12,10 +12,10 @@ crawlers:
    url: "https://cvpr.thecvf.com/Conferences/2025"
    source: "CVPR 2025"
    selector: ".conference-news-item"
-  - type: playwright
+  - type: static
    url: "https://www.ces.tech/discover/?type=Article%2CSuccess+Story%2CPodcast&sort=desc&topics=Artificial+Intelligence%2CContent+and+Entertainment%2CAccessibility%2CInnovation+For+All"
    source: "CES 2025"
-    selector: ".press-release-item"
+    selector: "h3"
  - type: rss
    url: "https://vc.ru/rss/tag/tech"
    source: "VC.ru Tech"
@ -49,7 +49,7 @@ crawlers:
  - type: cppconf
    url: "https://cppconf.ru/en/talks/"
    source: "C++ Russia"
-  - type: playwright
+  - type: static
    url: "https://2025.ieee-icra.org/media/"
    source: "ICRA 2025"
    selector: "h4"
@ -65,25 +65,23 @@ crawlers:
    url: "https://www.hannovermesse.de/en/news/news-articles/"
    source: "Hannover Messe"
    selector: ".news-card"
-  - type: playwright
+  - type: static
    url: "https://rscf.ru/en/news/"
    source: "RSF"
    selector: ".news-item"
-  - type: playwright
+  - type: skolkovo
    url: "https://sk.ru/news/"
    source: "Skolkovo"
-    selector: ".news-list-item"
-  - type: playwright
-    url: "https://research-and-innovation.ec.europa.eu/news_en"
+  - type: rss
+    url: "https://research-and-innovation.ec.europa.eu/node/2/rss_en"
    source: "Horizon Europe"
-    selector: ".ecl-news-item"
  - type: rss
    url: "https://rb.ru/feeds/all/"
    source: "RB.ru"
  - type: rss
    url: "https://habr.com/ru/rss/all/all/?fl=ru"
    source: "Habr"
-  - type: playwright
+  - type: static
    url: "https://t.me/s/addmeto"
    source: "Telegram: Addmeto"
    selector: ".tgme_widget_message_text"
--- a/src/crawlers/factory.py
+++ b/src/crawlers/factory.py
@ -5,6 +5,8 @@ from src.crawlers.base import ICrawler
 from src.crawlers.rss_crawler import RSSCrawler
 from src.crawlers.playwright_crawler import PlaywrightCrawler
 from src.crawlers.cppconf_crawler import CppConfCrawler
+from src.crawlers.static_crawler import StaticCrawler
+from src.crawlers.skolkovo_crawler import SkolkovoCrawler

 logger = logging.getLogger(__name__)

@ -39,6 +41,14 @@ class CrawlerFactory:
                    crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
                elif crawler_type == 'cppconf':
                    crawlers.append(CppConfCrawler(url=url, source=source))
+                elif crawler_type == 'static':
+                    selector = item.get('selector')
+                    if selector:
+                        crawlers.append(StaticCrawler(url=url, source=source, selector=selector))
+                    else:
+                        logger.warning(f"Missing mandatory field 'selector' for static crawler: {item}")
+                elif crawler_type == 'skolkovo':
+                    crawlers.append(SkolkovoCrawler(url=url, source=source))
                else:
                    logger.warning(f"Unknown crawler type: {crawler_type}")
            
--- a/src/crawlers/skolkovo_crawler.py
+++ b/src/crawlers/skolkovo_crawler.py
@ -0,0 +1,66 @@
+import json
+import re
+import aiohttp
+from datetime import datetime, timezone
+from typing import List
+from .base import ICrawler
+from .dto import NewsItemDTO
+
+class SkolkovoCrawler(ICrawler):
+    """Crawler that reads news from the Next.js payload embedded in a page.
+
+    The page is downloaded with aiohttp and the JSON inside the
+    ``<script id="__NEXT_DATA__">`` tag is read at
+    ``props.pageProps.initialProps.homeStore.news.items``.
+    """
+
+    # DOTALL lets the JSON payload span several lines; the non-greedy group
+    # stops at the first closing </script>.
+    _NEXT_DATA_RE = re.compile(
+        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
+        re.DOTALL,
+    )
+    _TAG_RE = re.compile(r'<[^>]+>')
+
+    def __init__(self, url: str, source: str = "Skolkovo"):
+        """Store the page URL to fetch and the source label put on every item."""
+        self.url = url
+        self.source = source
+
+    async def fetch_latest(self) -> List[NewsItemDTO]:
+        """Download ``self.url`` and return the news items found in it.
+
+        Best-effort: a non-200 response or any exception is logged and
+        results in an empty list rather than an error for the caller.
+        """
+        # Function-scope import: this module has no module-level logger.
+        import logging
+
+        logger = logging.getLogger(__name__)
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
+                    if response.status != 200:
+                        logger.warning(
+                            "%s: unexpected HTTP status %s for %s",
+                            self.source, response.status, self.url,
+                        )
+                        return []
+                    html = await response.text()
+            return self.parse_nextjs(html)
+        except Exception:
+            logger.exception("%s: failed to crawl %s", self.source, self.url)
+            return []
+
+    def parse_nextjs(self, html: str) -> List[NewsItemDTO]:
+        """Extract news items from the ``__NEXT_DATA__`` JSON found in *html*.
+
+        Returns an empty list when the script tag is missing, the JSON is
+        invalid, or the expected key path is absent. Entries that are not
+        JSON objects are skipped.
+        """
+        match = self._NEXT_DATA_RE.search(html)
+        if not match:
+            return []
+
+        try:
+            data = json.loads(match.group(1))
+            news_data = data["props"]["pageProps"]["initialProps"]["homeStore"]["news"]
+            # "items" may be present but null; treat that like a missing key.
+            items_list = news_data.get("items") or []
+        except (KeyError, TypeError, AttributeError, json.JSONDecodeError):
+            return []
+
+        if not isinstance(items_list, list):
+            return []
+
+        news_items = []
+        for item in items_list:
+            if not isinstance(item, dict):
+                continue
+
+            # `or ""` also covers keys that are present with a null value.
+            title = item.get("title") or ""
+            # Slug is used for URL; fall back to the listing page without one.
+            slug = item.get("slug") or ""
+            url = f"https://sk.ru/news/{slug}/" if slug else self.url
+
+            # Replace simple HTML tags with spaces, then collapse whitespace.
+            raw_text = str(item.get("description") or "")
+            content_text = ' '.join(self._TAG_RE.sub(' ', raw_text).split())
+
+            news_items.append(NewsItemDTO(
+                title=title,
+                url=url,
+                content_text=content_text,
+                source=self.source,
+                timestamp=self._parse_timestamp(
+                    item.get("published_at") or item.get("created_at")
+                ),
+            ))
+
+        return news_items
+
+    @staticmethod
+    def _parse_timestamp(ts_str) -> datetime:
+        """Parse an ISO-8601 string into an aware datetime; fall back to now (UTC)."""
+        if isinstance(ts_str, str) and ts_str:
+            try:
+                parsed = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
+            except ValueError:
+                pass
+            else:
+                if parsed.tzinfo is None:
+                    # NOTE(review): values without an offset are assumed to be
+                    # UTC so they are never mixed with the aware fallback --
+                    # confirm against the site's data.
+                    parsed = parsed.replace(tzinfo=timezone.utc)
+                return parsed
+        return datetime.now(timezone.utc)
--- a/src/crawlers/static_crawler.py
+++ b/src/crawlers/static_crawler.py
@ -0,0 +1,79 @@
+import asyncio
+import aiohttp
+import re
+from typing import List
+from datetime import datetime, timezone
+from bs4 import BeautifulSoup
+from .base import ICrawler
+from .dto import NewsItemDTO
+
+class StaticCrawler(ICrawler):
+    """Generic crawler for server-rendered pages using aiohttp and BeautifulSoup.
+
+    Every element matched by the CSS ``selector`` is treated as one news
+    entry; its link and title are looked up inside (or around) that element.
+    """
+
+    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    _TITLE_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']
+
+    def __init__(self, url: str, source: str, selector: str):
+        """Store the page URL, the source label and the CSS selector for entries."""
+        self.url = url
+        self.source = source
+        self.selector = selector
+
+    async def fetch_latest(self) -> List[NewsItemDTO]:
+        """Download ``self.url`` and return the items parsed from its HTML.
+
+        Best-effort: a non-200 response or any exception is logged and
+        results in an empty list rather than an error for the caller.
+        """
+        # Function-scope import: this module has no module-level logger.
+        import logging
+
+        logger = logging.getLogger(__name__)
+        headers = {'User-Agent': self._USER_AGENT}
+        try:
+            async with aiohttp.ClientSession(headers=headers) as session:
+                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
+                    if response.status != 200:
+                        logger.warning(
+                            "%s: unexpected HTTP status %s for %s",
+                            self.source, response.status, self.url,
+                        )
+                        return []
+                    html = await response.text()
+            return self.parse_html(html)
+        except Exception:
+            logger.exception("%s: failed to crawl %s", self.source, self.url)
+            return []
+
+    def parse_html(self, html: str) -> List[NewsItemDTO]:
+        """Build one item per selector match that has both a title and an href.
+
+        Hrefs are resolved against ``self.url``, so relative links become
+        absolute and absolute links are kept as they are. The timestamp is
+        the parse time because no date is read from the page.
+        """
+        from urllib.parse import urljoin
+
+        soup = BeautifulSoup(html, "html.parser")
+        items = []
+
+        for el in soup.select(self.selector):
+            link_el, title = self._find_link_and_title(el)
+            if link_el is None:
+                continue
+
+            href = (link_el.get('href') or "").strip()
+            if not title or not href:
+                continue
+
+            items.append(NewsItemDTO(
+                title=title,
+                url=urljoin(self.url, href),
+                content_text=el.get_text(separator=" ", strip=True),
+                source=self.source,
+                timestamp=datetime.now(timezone.utc)
+            ))
+
+        return items
+
+    @classmethod
+    def _find_link_and_title(cls, el):
+        """Return ``(link_tag, title)`` for one matched element, or ``(None, "")``."""
+        all_links = el.find_all('a')
+
+        # Preferred: the first descendant link that has text of its own.
+        for a in all_links:
+            txt = a.get_text(strip=True)
+            if txt:
+                return a, txt
+
+        if all_links:
+            # Links without text: take the first one, title comes from elsewhere.
+            link_el = all_links[0]
+        elif el.name == 'a':
+            # The selector matched the anchor itself.
+            link_el = el
+        else:
+            # The match may sit inside a link, e.g. <a><h3>...</h3></a>.
+            link_el = el.find_parent('a')
+
+        if link_el is None:
+            return None, ""
+
+        title_el = el.find(cls._TITLE_TAGS)
+        title = title_el.get_text(strip=True) if title_el else ""
+        if not title and not all_links:
+            # With no descendant link, the element's own text is the title.
+            title = el.get_text(strip=True)
+        return link_el, title
--- a/tests/crawlers/test_new_crawlers.py
+++ b/tests/crawlers/test_new_crawlers.py
@ -0,0 +1,27 @@
+import pytest
+import aiohttp
+from src.crawlers.static_crawler import StaticCrawler
+from src.crawlers.skolkovo_crawler import SkolkovoCrawler
+from src.crawlers.dto import NewsItemDTO
+
+@pytest.mark.asyncio
+async def test_static_crawler_addmeto():
+    """Fetch https://t.me/s/addmeto and expect at least one labelled item.
+
+    Goes through ``fetch_latest``, so it performs a real HTTP request.
+    """
+    addmeto = StaticCrawler(
+        url="https://t.me/s/addmeto",
+        source="Telegram: Addmeto",
+        selector=".tgme_widget_message_text",
+    )
+    fetched = await addmeto.fetch_latest()
+    assert len(fetched) > 0
+    first = fetched[0]
+    assert first.source == "Telegram: Addmeto"
+
+
+@pytest.mark.asyncio
+async def test_static_crawler_rsf():
+    """Fetch https://rscf.ru/en/news/ and check the first item's source and URL.
+
+    Goes through ``fetch_latest``, so it performs a real HTTP request.
+    """
+    rsf = StaticCrawler(
+        url="https://rscf.ru/en/news/",
+        source="RSF",
+        selector=".news-item",
+    )
+    fetched = await rsf.fetch_latest()
+    assert len(fetched) > 0
+    first = fetched[0]
+    assert first.source == "RSF"
+    assert "rscf.ru" in first.url
+    assert "rscf.ru" in items[0].url
+
+@pytest.mark.asyncio
+async def test_skolkovo_crawler():
+    """Fetch https://sk.ru/news/ and check the first item's source and URL.
+
+    Goes through ``fetch_latest``, so it performs a real HTTP request.
+    """
+    skolkovo = SkolkovoCrawler(
+        url="https://sk.ru/news/",
+        source="Skolkovo",
+    )
+    fetched = await skolkovo.fetch_latest()
+    assert len(fetched) > 0
+    first = fetched[0]
+    assert first.source == "Skolkovo"
+    assert "sk.ru" in first.url