feat(crawlers): convert multiple sources from Playwright to Static/RSS
- Added `StaticCrawler` for generic aiohttp+BS4 parsing.
- Added `SkolkovoCrawler` for specialized Next.js parsing of sk.ru.
- Converted ICRA 2025, RSF, CES 2025, and Telegram Addmeto to `static`.
- Converted Horizon Europe to `rss` using its native feed.
- Updated `CrawlerFactory` to support new crawler types.
- Validated changes with unit tests.
This commit is contained in:
parent
a363ca41cf
commit
217037f72e
@ -12,10 +12,10 @@ crawlers:
|
||||
url: "https://cvpr.thecvf.com/Conferences/2025"
|
||||
source: "CVPR 2025"
|
||||
selector: ".conference-news-item"
|
||||
- type: playwright
|
||||
- type: static
|
||||
url: "https://www.ces.tech/discover/?type=Article%2CSuccess+Story%2CPodcast&sort=desc&topics=Artificial+Intelligence%2CContent+and+Entertainment%2CAccessibility%2CInnovation+For+All"
|
||||
source: "CES 2025"
|
||||
selector: ".press-release-item"
|
||||
selector: "h3"
|
||||
- type: rss
|
||||
url: "https://vc.ru/rss/tag/tech"
|
||||
source: "VC.ru Tech"
|
||||
@ -49,7 +49,7 @@ crawlers:
|
||||
- type: cppconf
|
||||
url: "https://cppconf.ru/en/talks/"
|
||||
source: "C++ Russia"
|
||||
- type: playwright
|
||||
- type: static
|
||||
url: "https://2025.ieee-icra.org/media/"
|
||||
source: "ICRA 2025"
|
||||
selector: "h4"
|
||||
@ -65,25 +65,23 @@ crawlers:
|
||||
url: "https://www.hannovermesse.de/en/news/news-articles/"
|
||||
source: "Hannover Messe"
|
||||
selector: ".news-card"
|
||||
- type: playwright
|
||||
- type: static
|
||||
url: "https://rscf.ru/en/news/"
|
||||
source: "RSF"
|
||||
selector: ".news-item"
|
||||
- type: playwright
|
||||
- type: skolkovo
|
||||
url: "https://sk.ru/news/"
|
||||
source: "Skolkovo"
|
||||
selector: ".news-list-item"
|
||||
- type: playwright
|
||||
url: "https://research-and-innovation.ec.europa.eu/news_en"
|
||||
- type: rss
|
||||
url: "https://research-and-innovation.ec.europa.eu/node/2/rss_en"
|
||||
source: "Horizon Europe"
|
||||
selector: ".ecl-news-item"
|
||||
- type: rss
|
||||
url: "https://rb.ru/feeds/all/"
|
||||
source: "RB.ru"
|
||||
- type: rss
|
||||
url: "https://habr.com/ru/rss/all/all/?fl=ru"
|
||||
source: "Habr"
|
||||
- type: playwright
|
||||
- type: static
|
||||
url: "https://t.me/s/addmeto"
|
||||
source: "Telegram: Addmeto"
|
||||
selector: ".tgme_widget_message_text"
|
||||
|
||||
@ -5,6 +5,8 @@ from src.crawlers.base import ICrawler
|
||||
from src.crawlers.rss_crawler import RSSCrawler
|
||||
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
||||
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||
from src.crawlers.static_crawler import StaticCrawler
|
||||
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -39,6 +41,14 @@ class CrawlerFactory:
|
||||
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
|
||||
elif crawler_type == 'cppconf':
|
||||
crawlers.append(CppConfCrawler(url=url, source=source))
|
||||
elif crawler_type == 'static':
|
||||
selector = item.get('selector')
|
||||
if selector:
|
||||
crawlers.append(StaticCrawler(url=url, source=source, selector=selector))
|
||||
else:
|
||||
logger.warning(f"Missing mandatory field 'selector' for static crawler: {item}")
|
||||
elif crawler_type == 'skolkovo':
|
||||
crawlers.append(SkolkovoCrawler(url=url, source=source))
|
||||
else:
|
||||
logger.warning(f"Unknown crawler type: {crawler_type}")
|
||||
|
||||
|
||||
66
src/crawlers/skolkovo_crawler.py
Normal file
66
src/crawlers/skolkovo_crawler.py
Normal file
@ -0,0 +1,66 @@
|
||||
import json
|
||||
import re
|
||||
import aiohttp
|
||||
from datetime import datetime, timezone
|
||||
from typing import List
|
||||
from .base import ICrawler
|
||||
from .dto import NewsItemDTO
|
||||
|
||||
class SkolkovoCrawler(ICrawler):
    """Crawler for the Next.js-rendered news page of sk.ru.

    The page is downloaded with aiohttp and the news entries are read from the
    JSON payload embedded in the ``<script id="__NEXT_DATA__">`` tag instead of
    from rendered markup.
    """

    # DOTALL so the payload is still captured when the JSON spans several lines.
    _NEXT_DATA_RE = re.compile(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        re.DOTALL,
    )
    _HTML_TAG_RE = re.compile(r'<[^>]+>')

    def __init__(self, url: str, source: str = "Skolkovo"):
        """Store the page URL to fetch and the source label put on every item."""
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download ``self.url`` and return the news items found in it.

        Best-effort: a non-200 response or any exception while fetching or
        parsing yields an empty list (the failure is logged, never raised).
        """
        # Local import keeps this block self-contained (the module does not
        # import logging at file level).
        import logging

        log = logging.getLogger(__name__)
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        log.warning("Skolkovo crawler got HTTP %s for %s", response.status, self.url)
                        return []
                    html = await response.text()
                    return self.parse_nextjs(html)
            except Exception:
                log.exception("Skolkovo crawler failed for %s", self.url)
                return []

    def parse_nextjs(self, html: str) -> List[NewsItemDTO]:
        """Extract news items from the ``__NEXT_DATA__`` JSON embedded in *html*.

        Returns an empty list when the script tag is missing, the JSON is
        invalid, or the expected ``props.pageProps.initialProps.homeStore.news``
        path is absent or has an unexpected shape. Entries that are not JSON
        objects are skipped so one malformed entry does not discard the rest.
        """
        match = self._NEXT_DATA_RE.search(html)
        if not match:
            return []

        try:
            data = json.loads(match.group(1))
            news_data = data["props"]["pageProps"]["initialProps"]["homeStore"]["news"]
            items_list = news_data.get("items", [])
        except (KeyError, TypeError, AttributeError, json.JSONDecodeError):
            # AttributeError: "news" exists but is not an object (e.g. a list).
            return []

        if not isinstance(items_list, list):
            # e.g. "items": null
            return []

        news_items = []
        for item in items_list:
            if not isinstance(item, dict):
                continue

            # `or ""` also covers keys that are present with a null value,
            # which dict.get's default argument does not.
            title = item.get("title") or ""
            slug = item.get("slug") or ""
            # The slug identifies the article page; fall back to the listing URL.
            url = f"https://sk.ru/news/{slug}/" if slug else self.url

            news_items.append(NewsItemDTO(
                title=title,
                url=url,
                content_text=self._strip_html(item.get("description")),
                source=self.source,
                timestamp=self._parse_timestamp(
                    item.get("published_at") or item.get("created_at")
                ),
            ))

        return news_items

    @classmethod
    def _strip_html(cls, raw: object) -> str:
        """Replace HTML tags with spaces and collapse whitespace; non-strings give ""."""
        if not isinstance(raw, str):
            return ""
        return ' '.join(cls._HTML_TAG_RE.sub(' ', raw).split())

    @staticmethod
    def _parse_timestamp(raw: object) -> datetime:
        """Parse an ISO-8601 string into an aware datetime, else return now (UTC)."""
        if isinstance(raw, str) and raw:
            try:
                # Older fromisoformat() implementations reject a trailing "Z".
                parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
            except ValueError:
                pass
            else:
                if parsed.tzinfo is None:
                    # NOTE(review): naive values are treated as UTC so every
                    # returned timestamp is timezone-aware like the fallback;
                    # confirm the feed does not emit local (Moscow) time.
                    parsed = parsed.replace(tzinfo=timezone.utc)
                return parsed
        return datetime.now(timezone.utc)
|
||||
79
src/crawlers/static_crawler.py
Normal file
79
src/crawlers/static_crawler.py
Normal file
@ -0,0 +1,79 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import re
|
||||
from typing import List
|
||||
from datetime import datetime, timezone
|
||||
from bs4 import BeautifulSoup
|
||||
from .base import ICrawler
|
||||
from .dto import NewsItemDTO
|
||||
|
||||
class StaticCrawler(ICrawler):
    """Generic crawler for server-rendered pages (aiohttp + BeautifulSoup).

    Every element matched by the CSS ``selector`` is treated as one news entry;
    its title and link are taken from an anchor inside (or around) it.
    """

    # Tags searched for a title when the entry's links carry no text.
    _TITLE_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']

    def __init__(self, url: str, source: str, selector: str):
        """Store the page URL, the source label and the CSS selector of one entry."""
        self.url = url
        self.source = source
        self.selector = selector

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download ``self.url`` and return the entries parsed from it.

        Best-effort: a non-200 response or any exception while fetching or
        parsing yields an empty list (the failure is logged, never raised).
        """
        # Local import keeps this block self-contained (the module does not
        # import logging at file level).
        import logging

        log = logging.getLogger(__name__)
        # A browser-like User-Agent is sent with every request of the session.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        log.warning("Static crawler got HTTP %s for %s", response.status, self.url)
                        return []
                    html = await response.text()
                    return self.parse_html(html)
            except Exception:
                log.exception("Static crawler failed for %s", self.url)
                return []

    def parse_html(self, html: str) -> List[NewsItemDTO]:
        """Build one ``NewsItemDTO`` per element matching ``self.selector``.

        Elements without a usable link or title are skipped. Links are resolved
        against ``self.url``, so relative hrefs of any form become absolute
        while absolute ones are left unchanged. The page exposes no publication
        date here, so the timestamp is the time of parsing (UTC).
        """
        # Imported once per call instead of once per matched element.
        from urllib.parse import urljoin

        soup = BeautifulSoup(html, "html.parser")
        items = []

        for el in soup.select(self.selector):
            link_el, title = self._find_link_and_title(el)
            if link_el is None:
                continue

            href = (link_el.get('href') or "").strip()
            if not title or not href:
                continue

            items.append(NewsItemDTO(
                title=title,
                url=urljoin(self.url, href),
                content_text=el.get_text(separator=" ", strip=True),
                source=self.source,
                timestamp=datetime.now(timezone.utc),
            ))

        return items

    def _find_link_and_title(self, el):
        """Return ``(anchor, title)`` for one matched element, or ``(None, "")``."""
        all_links = el.find_all('a')

        # Preferred: the first descendant link that has text content.
        for a in all_links:
            txt = a.get_text(strip=True)
            if txt:
                return a, txt

        # Links without text (e.g. image links): use the first one and take
        # the title from a heading or paragraph of the element.
        if all_links:
            title_el = el.find(self._TITLE_TAGS)
            return all_links[0], (title_el.get_text(strip=True) if title_el else "")

        # No descendant link: the selector may have matched the anchor itself
        # or an element (such as a heading) wrapped by an anchor.
        enclosing = el if el.name == 'a' else el.find_parent('a')
        if enclosing is not None:
            return enclosing, el.get_text(strip=True)

        return None, ""
|
||||
27
tests/crawlers/test_new_crawlers.py
Normal file
27
tests/crawlers/test_new_crawlers.py
Normal file
@ -0,0 +1,27 @@
|
||||
import pytest
|
||||
import aiohttp
|
||||
from src.crawlers.static_crawler import StaticCrawler
|
||||
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
||||
from src.crawlers.dto import NewsItemDTO
|
||||
|
||||
@pytest.mark.asyncio
async def test_static_crawler_addmeto():
    """StaticCrawler returns items labelled with its source for t.me/s/addmeto.

    Calls ``fetch_latest`` against the real URL, so it needs network access.
    """
    source_name = "Telegram: Addmeto"
    fetched = await StaticCrawler(
        url="https://t.me/s/addmeto",
        source=source_name,
        selector=".tgme_widget_message_text",
    ).fetch_latest()

    assert fetched
    first = fetched[0]
    assert first.source == source_name
|
||||
@pytest.mark.asyncio
async def test_static_crawler_rsf():
    """StaticCrawler returns RSF items whose links point at rscf.ru.

    Calls ``fetch_latest`` against the real URL, so it needs network access.
    """
    source_name = "RSF"
    fetched = await StaticCrawler(
        url="https://rscf.ru/en/news/",
        source=source_name,
        selector=".news-item",
    ).fetch_latest()

    assert fetched
    first = fetched[0]
    assert first.source == source_name
    assert "rscf.ru" in first.url
|
||||
|
||||
@pytest.mark.asyncio
async def test_skolkovo_crawler():
    """SkolkovoCrawler returns items labelled "Skolkovo" with sk.ru links.

    Calls ``fetch_latest`` against the real URL, so it needs network access.
    """
    source_name = "Skolkovo"
    fetched = await SkolkovoCrawler(
        url="https://sk.ru/news/",
        source=source_name,
    ).fetch_latest()

    assert fetched
    first = fetched[0]
    assert first.source == source_name
    assert "sk.ru" in first.url
|
||||
Loading…
x
Reference in New Issue
Block a user