feat(crawlers): convert multiple sources from Playwright to Static/RSS

- Added `StaticCrawler` for generic aiohttp+BS4 parsing.
- Added `SkolkovoCrawler` for specialized Next.js parsing of sk.ru.
- Converted ICRA 2025, RSF, CES 2025, and Telegram Addmeto to `static`.
- Converted Horizon Europe to `rss` using its native feed.
- Updated `CrawlerFactory` to support new crawler types.
- Validated the Addmeto, RSF, and Skolkovo crawlers with tests that fetch the live sites.
This commit is contained in:
Artur Mukhamadiev 2026-03-15 20:45:57 +03:00
parent a363ca41cf
commit 217037f72e
5 changed files with 190 additions and 10 deletions

View File

@ -12,10 +12,10 @@ crawlers:
url: "https://cvpr.thecvf.com/Conferences/2025"
source: "CVPR 2025"
selector: ".conference-news-item"
- type: playwright
- type: static
url: "https://www.ces.tech/discover/?type=Article%2CSuccess+Story%2CPodcast&sort=desc&topics=Artificial+Intelligence%2CContent+and+Entertainment%2CAccessibility%2CInnovation+For+All"
source: "CES 2025"
selector: ".press-release-item"
selector: "h3"
- type: rss
url: "https://vc.ru/rss/tag/tech"
source: "VC.ru Tech"
@ -49,7 +49,7 @@ crawlers:
- type: cppconf
url: "https://cppconf.ru/en/talks/"
source: "C++ Russia"
- type: playwright
- type: static
url: "https://2025.ieee-icra.org/media/"
source: "ICRA 2025"
selector: "h4"
@ -65,25 +65,23 @@ crawlers:
url: "https://www.hannovermesse.de/en/news/news-articles/"
source: "Hannover Messe"
selector: ".news-card"
- type: playwright
- type: static
url: "https://rscf.ru/en/news/"
source: "RSF"
selector: ".news-item"
- type: playwright
- type: skolkovo
url: "https://sk.ru/news/"
source: "Skolkovo"
selector: ".news-list-item"
- type: playwright
url: "https://research-and-innovation.ec.europa.eu/news_en"
- type: rss
url: "https://research-and-innovation.ec.europa.eu/node/2/rss_en"
source: "Horizon Europe"
selector: ".ecl-news-item"
- type: rss
url: "https://rb.ru/feeds/all/"
source: "RB.ru"
- type: rss
url: "https://habr.com/ru/rss/all/all/?fl=ru"
source: "Habr"
- type: playwright
- type: static
url: "https://t.me/s/addmeto"
source: "Telegram: Addmeto"
selector: ".tgme_widget_message_text"

View File

@ -5,6 +5,8 @@ from src.crawlers.base import ICrawler
from src.crawlers.rss_crawler import RSSCrawler
from src.crawlers.playwright_crawler import PlaywrightCrawler
from src.crawlers.cppconf_crawler import CppConfCrawler
from src.crawlers.static_crawler import StaticCrawler
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
logger = logging.getLogger(__name__)
@ -39,6 +41,14 @@ class CrawlerFactory:
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
elif crawler_type == 'cppconf':
crawlers.append(CppConfCrawler(url=url, source=source))
elif crawler_type == 'static':
selector = item.get('selector')
if selector:
crawlers.append(StaticCrawler(url=url, source=source, selector=selector))
else:
logger.warning(f"Missing mandatory field 'selector' for static crawler: {item}")
elif crawler_type == 'skolkovo':
crawlers.append(SkolkovoCrawler(url=url, source=source))
else:
logger.warning(f"Unknown crawler type: {crawler_type}")

View File

@ -0,0 +1,66 @@
import json
import re
import aiohttp
from datetime import datetime, timezone
from typing import List
from .base import ICrawler
from .dto import NewsItemDTO
class SkolkovoCrawler(ICrawler):
    """Crawler that reads news from the Next.js ``__NEXT_DATA__`` JSON of a page.

    The page at ``url`` is fetched with aiohttp and the news entries are taken
    from ``props.pageProps.initialProps.homeStore.news.items`` of the embedded
    JSON payload.
    """

    def __init__(self, url: str, source: str = "Skolkovo"):
        """Store the page URL to fetch and the source label put on every item."""
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch ``self.url`` and return the news items embedded in the page.

        This is deliberately best-effort: a non-200 response, a network
        failure, a timeout (30 s total) or any parsing error results in an
        empty list instead of an exception.
        """
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        return []
                    html = await response.text()
                    return self.parse_nextjs(html)
            except Exception:
                return []

    def parse_nextjs(self, html: str) -> List[NewsItemDTO]:
        """Extract news items from the ``__NEXT_DATA__`` script tag of *html*.

        Returns an empty list when the script tag is missing, the JSON is
        invalid, or the expected path to the news list is absent. Individual
        malformed entries are skipped or normalized so that one bad record
        does not discard the rest of the batch.
        """
        # DOTALL lets the payload span several lines (pretty-printed JSON).
        match = re.search(
            r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            html,
            re.DOTALL,
        )
        if not match:
            return []

        try:
            data = json.loads(match.group(1))
            news_data = data["props"]["pageProps"]["initialProps"]["homeStore"]["news"]
            # `or []` also covers an explicit `"items": null`.
            items_list = news_data.get("items") or []
        except (KeyError, TypeError, AttributeError, json.JSONDecodeError):
            # AttributeError: the "news" node is not a dict (e.g. a list).
            return []

        if not isinstance(items_list, list):
            return []

        news_items = []
        for item in items_list:
            if not isinstance(item, dict):
                continue

            # JSON null values come back as None; `or ""` normalizes them.
            title = item.get("title") or ""

            # Slug is used for URL; without one, fall back to the listing page.
            slug = item.get("slug") or ""
            url = f"https://sk.ru/news/{slug}/" if slug else self.url

            # Clean up simple HTML if present, then collapse whitespace.
            content_text = re.sub(r'<[^>]+>', ' ', str(item.get("description") or ""))
            content_text = ' '.join(content_text.split())

            timestamp = self._parse_timestamp(
                item.get("published_at") or item.get("created_at")
            )

            news_items.append(NewsItemDTO(
                title=title,
                url=url,
                content_text=content_text,
                source=self.source,
                timestamp=timestamp,
            ))

        return news_items

    @staticmethod
    def _parse_timestamp(ts_str: object) -> datetime:
        """Return a timezone-aware datetime for *ts_str*, or "now" in UTC.

        Non-string or unparsable values fall back to the current UTC time.
        """
        if isinstance(ts_str, str) and ts_str:
            try:
                parsed = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
            except ValueError:
                pass
            else:
                if parsed.tzinfo is None:
                    # NOTE(review): offset-less timestamps are assumed to be
                    # UTC so they stay comparable with the aware fallback
                    # value - confirm against the sk.ru payload.
                    parsed = parsed.replace(tzinfo=timezone.utc)
                return parsed
        return datetime.now(timezone.utc)

View File

@ -0,0 +1,79 @@
import asyncio
import aiohttp
import re
from typing import List
from datetime import datetime, timezone
from bs4 import BeautifulSoup
from .base import ICrawler
from .dto import NewsItemDTO
class StaticCrawler(ICrawler):
    """Generic crawler for server-rendered pages (aiohttp + BeautifulSoup).

    Every element matching the CSS ``selector`` is treated as one news entry;
    its title and URL are taken from a link found inside that element.
    """

    def __init__(self, url: str, source: str, selector: str):
        """Store the page URL, the source label and the CSS selector for entries."""
        self.url = url
        self.source = source
        self.selector = selector

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch ``self.url`` and return the entries parsed from its HTML.

        This is deliberately best-effort: a non-200 response, a network
        failure, a timeout (30 s total) or any parsing error results in an
        empty list instead of an exception.
        """
        # A browser-like User-Agent is sent with every request of the session.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        return []
                    html = await response.text()
                    return self.parse_html(html)
            except Exception:
                return []

    def parse_html(self, html: str) -> List[NewsItemDTO]:
        """Build one ``NewsItemDTO`` per element of *html* matching the selector.

        Elements without a link, without a title or without an ``href`` are
        skipped. Relative hrefs of any form are resolved against ``self.url``.
        The timestamp of every item is the time of parsing (UTC), because no
        publication date is extracted from the page.
        """
        # Imported once per call instead of once per matched element.
        from urllib.parse import urljoin

        soup = BeautifulSoup(html, "html.parser")
        items = []

        for el in soup.select(self.selector):
            all_links = el.find_all('a')
            if not all_links:
                continue

            # Prefer the first link that has text content: it gives both the
            # title and the target URL.
            link_el = None
            title = ""
            for a in all_links:
                txt = a.get_text(strip=True)
                if txt:
                    title = txt
                    link_el = a
                    break

            # If no link has text, take the first link and look for the title
            # in a heading or paragraph of the element.
            if link_el is None:
                link_el = all_links[0]
                title_el = el.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])
                if title_el:
                    title = title_el.get_text(strip=True)

            url = (link_el.get('href') or "").strip()
            if not title or not url:
                continue

            # Normalize URL: urljoin leaves absolute URLs untouched and
            # resolves root-relative ("/x"), document-relative ("x", "../x")
            # and protocol-relative ("//host/x") references.
            url = urljoin(self.url, url)

            content_text = el.get_text(separator=" ", strip=True)

            items.append(NewsItemDTO(
                title=title,
                url=url,
                content_text=content_text,
                source=self.source,
                timestamp=datetime.now(timezone.utc),
            ))

        return items

View File

@ -0,0 +1,27 @@
import pytest
import aiohttp
from src.crawlers.static_crawler import StaticCrawler
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
from src.crawlers.dto import NewsItemDTO
@pytest.mark.asyncio
async def test_static_crawler_addmeto():
    """Fetch the public Addmeto Telegram preview and expect labelled items.

    This test performs a real HTTP request to t.me.
    """
    addmeto = StaticCrawler(
        url="https://t.me/s/addmeto",
        source="Telegram: Addmeto",
        selector=".tgme_widget_message_text",
    )

    fetched = await addmeto.fetch_latest()

    assert len(fetched) > 0
    first = fetched[0]
    assert first.source == "Telegram: Addmeto"
@pytest.mark.asyncio
async def test_static_crawler_rsf():
    """Fetch the RSF news page and expect items that link back to rscf.ru.

    This test performs a real HTTP request to rscf.ru.
    """
    rsf = StaticCrawler(
        url="https://rscf.ru/en/news/",
        source="RSF",
        selector=".news-item",
    )

    fetched = await rsf.fetch_latest()

    assert len(fetched) > 0
    first = fetched[0]
    assert first.source == "RSF"
    assert "rscf.ru" in first.url
@pytest.mark.asyncio
async def test_skolkovo_crawler():
    """Fetch the Skolkovo news page and expect items that link to sk.ru.

    This test performs a real HTTP request to sk.ru.
    """
    skolkovo = SkolkovoCrawler(
        url="https://sk.ru/news/",
        source="Skolkovo",
    )

    fetched = await skolkovo.fetch_latest()

    assert len(fetched) > 0
    first = fetched[0]
    assert first.source == "Skolkovo"
    assert "sk.ru" in first.url