AI-Trend-Scout/src/crawlers/skolkovo_crawler.py
Artur Mukhamadiev 217037f72e feat(crawlers): convert multiple sources from Playwright to Static/RSS
- Added `StaticCrawler` for generic aiohttp+BS4 parsing.
- Added `SkolkovoCrawler` for specialized Next.js parsing of sk.ru.
- Converted ICRA 2025, RSF, CES 2025, and Telegram Addmeto to `static`.
- Converted Horizon Europe to `rss` using its native feed.
- Updated `CrawlerFactory` to support new crawler types.
- Validated changes with unit tests.
2026-03-15 21:21:14 +03:00

67 lines
2.3 KiB
Python

import json
import logging
import re
from datetime import datetime, timezone
from typing import List

import aiohttp

from .base import ICrawler
from .dto import NewsItemDTO
class SkolkovoCrawler(ICrawler):
    """Crawler that extracts news items from a Next.js page of sk.ru.

    The page is downloaded with aiohttp and the news list is read from the
    JSON payload embedded in the ``__NEXT_DATA__`` script tag, under
    ``props.pageProps.initialProps.homeStore.news.items``.

    All failures (network errors, non-200 responses, unexpected page
    structure) are treated as "no news": they are logged and an empty list
    is returned, so one broken source does not abort the caller.
    """

    # Next.js bootstrap payload. DOTALL lets the JSON span several lines, and
    # the ``[^>]*`` parts tolerate extra attributes on the tag (e.g. a CSP
    # ``nonce``) while still matching the plain id+type form.
    _NEXT_DATA_RE = re.compile(
        r'<script[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>',
        re.DOTALL,
    )
    # Naive tag stripper; descriptions are expected to hold only simple markup.
    _TAG_RE = re.compile(r'<[^>]+>')
    # Total time budget (seconds) for one page download.
    _REQUEST_TIMEOUT_SECONDS = 30

    _logger = logging.getLogger(__name__)

    def __init__(self, url: str, source: str = "Skolkovo"):
        """Store the page URL to crawl and the source label put on each item."""
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download ``self.url`` and return the news items found on the page.

        Returns:
            The parsed news items, or an empty list when the request fails,
            the server does not answer with HTTP 200, or parsing fails.
        """
        timeout = aiohttp.ClientTimeout(total=self._REQUEST_TIMEOUT_SECONDS)
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(self.url, timeout=timeout) as response:
                    if response.status != 200:
                        self._logger.warning(
                            "Unexpected HTTP status %s for %s",
                            response.status,
                            self.url,
                        )
                        return []
                    html = await response.text()
                    return self.parse_nextjs(html)
            except Exception:
                # Deliberate best-effort boundary: a failing source must not
                # break the crawl, but the reason should be diagnosable.
                self._logger.exception("Failed to fetch news from %s", self.url)
                return []

    def parse_nextjs(self, html: str) -> List[NewsItemDTO]:
        """Extract news items from the ``__NEXT_DATA__`` JSON inside *html*.

        Args:
            html: Full HTML text of the page.

        Returns:
            One ``NewsItemDTO`` per well-formed entry of the news list; an
            empty list when the payload is missing, is not valid JSON, or
            does not have the expected structure.
        """
        match = self._NEXT_DATA_RE.search(html)
        if not match:
            return []

        try:
            data = json.loads(match.group(1))
            news_data = data["props"]["pageProps"]["initialProps"]["homeStore"]["news"]
            items_list = news_data.get("items", [])
        except (KeyError, TypeError, AttributeError, json.JSONDecodeError):
            # AttributeError covers a "news" value that is not a JSON object.
            return []

        # ``"items": null`` or any other non-list value means "no news".
        if not isinstance(items_list, list):
            return []

        news_items = []
        for item in items_list:
            if not isinstance(item, dict):
                # Skip malformed entries instead of failing the whole page.
                continue

            # The slug identifies the article; without it fall back to the
            # crawled page itself.
            slug = item.get("slug", "")
            url = f"https://sk.ru/news/{slug}/" if slug else self.url

            news_items.append(NewsItemDTO(
                title=item.get("title") or "",
                url=url,
                content_text=self._clean_text(item.get("description")),
                source=self.source,
                timestamp=self._parse_timestamp(
                    item.get("published_at") or item.get("created_at")
                ),
            ))
        return news_items

    @classmethod
    def _clean_text(cls, raw) -> str:
        """Strip simple HTML tags from *raw* and collapse runs of whitespace."""
        if not isinstance(raw, str):
            # JSON ``null`` or an unexpected type: treat as empty text.
            return ""
        without_tags = cls._TAG_RE.sub(' ', raw)
        return ' '.join(without_tags.split())

    @staticmethod
    def _parse_timestamp(raw) -> datetime:
        """Parse an ISO-8601 string into an aware datetime.

        Falls back to the current UTC time when *raw* is missing, is not a
        string, or cannot be parsed.
        """
        if isinstance(raw, str) and raw:
            try:
                # ``fromisoformat`` rejects a trailing "Z" before Python 3.11.
                parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
            except ValueError:
                pass
            else:
                if parsed.tzinfo is None:
                    # NOTE(review): offset-less timestamps are assumed to be
                    # UTC so they stay comparable with the aware fallback;
                    # confirm against the actual sk.ru payload.
                    parsed = parsed.replace(tzinfo=timezone.utc)
                return parsed
        return datetime.now(timezone.utc)