feat(crawlers): convert multiple sources from Playwright to Static/RSS
- Added `StaticCrawler` for generic aiohttp+BS4 parsing.
- Added `SkolkovoCrawler` for specialized Next.js parsing of sk.ru.
- Converted ICRA 2025, RSF, CES 2025, and Telegram Addmeto to `static`.
- Converted Horizon Europe to `rss` using its native feed.
- Updated `CrawlerFactory` to support new crawler types.
- Validated changes with unit tests.
This commit is contained in:
parent
a363ca41cf
commit
217037f72e
@ -12,10 +12,10 @@ crawlers:
|
|||||||
url: "https://cvpr.thecvf.com/Conferences/2025"
|
url: "https://cvpr.thecvf.com/Conferences/2025"
|
||||||
source: "CVPR 2025"
|
source: "CVPR 2025"
|
||||||
selector: ".conference-news-item"
|
selector: ".conference-news-item"
|
||||||
- type: playwright
|
- type: static
|
||||||
url: "https://www.ces.tech/discover/?type=Article%2CSuccess+Story%2CPodcast&sort=desc&topics=Artificial+Intelligence%2CContent+and+Entertainment%2CAccessibility%2CInnovation+For+All"
|
url: "https://www.ces.tech/discover/?type=Article%2CSuccess+Story%2CPodcast&sort=desc&topics=Artificial+Intelligence%2CContent+and+Entertainment%2CAccessibility%2CInnovation+For+All"
|
||||||
source: "CES 2025"
|
source: "CES 2025"
|
||||||
selector: ".press-release-item"
|
selector: "h3"
|
||||||
- type: rss
|
- type: rss
|
||||||
url: "https://vc.ru/rss/tag/tech"
|
url: "https://vc.ru/rss/tag/tech"
|
||||||
source: "VC.ru Tech"
|
source: "VC.ru Tech"
|
||||||
@ -49,7 +49,7 @@ crawlers:
|
|||||||
- type: cppconf
|
- type: cppconf
|
||||||
url: "https://cppconf.ru/en/talks/"
|
url: "https://cppconf.ru/en/talks/"
|
||||||
source: "C++ Russia"
|
source: "C++ Russia"
|
||||||
- type: playwright
|
- type: static
|
||||||
url: "https://2025.ieee-icra.org/media/"
|
url: "https://2025.ieee-icra.org/media/"
|
||||||
source: "ICRA 2025"
|
source: "ICRA 2025"
|
||||||
selector: "h4"
|
selector: "h4"
|
||||||
@ -65,25 +65,23 @@ crawlers:
|
|||||||
url: "https://www.hannovermesse.de/en/news/news-articles/"
|
url: "https://www.hannovermesse.de/en/news/news-articles/"
|
||||||
source: "Hannover Messe"
|
source: "Hannover Messe"
|
||||||
selector: ".news-card"
|
selector: ".news-card"
|
||||||
- type: playwright
|
- type: static
|
||||||
url: "https://rscf.ru/en/news/"
|
url: "https://rscf.ru/en/news/"
|
||||||
source: "RSF"
|
source: "RSF"
|
||||||
selector: ".news-item"
|
selector: ".news-item"
|
||||||
- type: playwright
|
- type: skolkovo
|
||||||
url: "https://sk.ru/news/"
|
url: "https://sk.ru/news/"
|
||||||
source: "Skolkovo"
|
source: "Skolkovo"
|
||||||
selector: ".news-list-item"
|
- type: rss
|
||||||
- type: playwright
|
url: "https://research-and-innovation.ec.europa.eu/node/2/rss_en"
|
||||||
url: "https://research-and-innovation.ec.europa.eu/news_en"
|
|
||||||
source: "Horizon Europe"
|
source: "Horizon Europe"
|
||||||
selector: ".ecl-news-item"
|
|
||||||
- type: rss
|
- type: rss
|
||||||
url: "https://rb.ru/feeds/all/"
|
url: "https://rb.ru/feeds/all/"
|
||||||
source: "RB.ru"
|
source: "RB.ru"
|
||||||
- type: rss
|
- type: rss
|
||||||
url: "https://habr.com/ru/rss/all/all/?fl=ru"
|
url: "https://habr.com/ru/rss/all/all/?fl=ru"
|
||||||
source: "Habr"
|
source: "Habr"
|
||||||
- type: playwright
|
- type: static
|
||||||
url: "https://t.me/s/addmeto"
|
url: "https://t.me/s/addmeto"
|
||||||
source: "Telegram: Addmeto"
|
source: "Telegram: Addmeto"
|
||||||
selector: ".tgme_widget_message_text"
|
selector: ".tgme_widget_message_text"
|
||||||
|
|||||||
@ -5,6 +5,8 @@ from src.crawlers.base import ICrawler
|
|||||||
from src.crawlers.rss_crawler import RSSCrawler
|
from src.crawlers.rss_crawler import RSSCrawler
|
||||||
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
from src.crawlers.playwright_crawler import PlaywrightCrawler
|
||||||
from src.crawlers.cppconf_crawler import CppConfCrawler
|
from src.crawlers.cppconf_crawler import CppConfCrawler
|
||||||
|
from src.crawlers.static_crawler import StaticCrawler
|
||||||
|
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -39,6 +41,14 @@ class CrawlerFactory:
|
|||||||
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
|
crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
|
||||||
elif crawler_type == 'cppconf':
|
elif crawler_type == 'cppconf':
|
||||||
crawlers.append(CppConfCrawler(url=url, source=source))
|
crawlers.append(CppConfCrawler(url=url, source=source))
|
||||||
|
elif crawler_type == 'static':
|
||||||
|
selector = item.get('selector')
|
||||||
|
if selector:
|
||||||
|
crawlers.append(StaticCrawler(url=url, source=source, selector=selector))
|
||||||
|
else:
|
||||||
|
logger.warning(f"Missing mandatory field 'selector' for static crawler: {item}")
|
||||||
|
elif crawler_type == 'skolkovo':
|
||||||
|
crawlers.append(SkolkovoCrawler(url=url, source=source))
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown crawler type: {crawler_type}")
|
logger.warning(f"Unknown crawler type: {crawler_type}")
|
||||||
|
|
||||||
|
|||||||
66
src/crawlers/skolkovo_crawler.py
Normal file
66
src/crawlers/skolkovo_crawler.py
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
import json
|
||||||
|
import re
|
||||||
|
import aiohttp
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import List
|
||||||
|
from .base import ICrawler
|
||||||
|
from .dto import NewsItemDTO
|
||||||
|
|
||||||
|
class SkolkovoCrawler(ICrawler):
    """Crawler that reads news from the ``__NEXT_DATA__`` JSON embedded in a page.

    The page at ``url`` is downloaded with aiohttp and the JSON inside the
    ``<script id="__NEXT_DATA__">`` tag is decoded. News entries are read from
    ``props.pageProps.initialProps.homeStore.news.items``.
    """

    def __init__(self, url: str, source: str = "Skolkovo"):
        """Store the page URL to fetch and the source label put on every item."""
        self.url = url
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the page and return the news items found in it.

        Returns:
            The parsed items, or an empty list when the response status is not
            200 or when the request/parsing fails. Failures are logged rather
            than raised so one broken source does not abort the caller.
        """
        # Local import: the module does not import logging at the top level.
        import logging

        logger = logging.getLogger(__name__)
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        logger.warning("%s: unexpected HTTP status %s for %s", self.source, response.status, self.url)
                        return []
                    html = await response.text()
                    return self.parse_nextjs(html)
            except Exception:
                # Best-effort boundary: keep returning [] but leave a trace.
                logger.exception("%s: failed to fetch or parse %s", self.source, self.url)
                return []

    def parse_nextjs(self, html: str) -> List[NewsItemDTO]:
        """Extract news items from the ``__NEXT_DATA__`` script of ``html``.

        Args:
            html: Full page markup.

        Returns:
            One ``NewsItemDTO`` per dict entry in the news list; an empty list
            when the script tag, the expected JSON path, or the list itself is
            missing or malformed.
        """
        # DOTALL lets the payload span several lines; the attribute wildcards
        # tolerate extra attributes (or a different order) on the script tag.
        match = re.search(
            r'<script\b[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>',
            html,
            re.DOTALL,
        )
        if not match:
            return []

        try:
            data = json.loads(match.group(1))
            news_data = data["props"]["pageProps"]["initialProps"]["homeStore"]["news"]
            # `or []` also covers an explicit JSON null for "items".
            items_list = news_data.get("items") or []
        except (KeyError, TypeError, AttributeError, json.JSONDecodeError):
            return []

        if not isinstance(items_list, list):
            return []

        news_items = []
        for item in items_list:
            if not isinstance(item, dict):
                # Skip a malformed entry instead of losing the whole batch.
                continue

            title = item.get("title") or ""
            # The slug is the last path segment of the article URL.
            slug = item.get("slug") or ""
            url = f"https://sk.ru/news/{slug}/" if slug else self.url

            news_items.append(NewsItemDTO(
                title=title,
                url=url,
                content_text=self._clean_text(item.get("description")),
                source=self.source,
                timestamp=self._parse_timestamp(item.get("published_at") or item.get("created_at")),
            ))

        return news_items

    @staticmethod
    def _clean_text(value) -> str:
        """Strip simple HTML tags and collapse whitespace; non-strings become ''."""
        if not isinstance(value, str):
            return ""
        without_tags = re.sub(r'<[^>]+>', ' ', value)
        return ' '.join(without_tags.split())

    @staticmethod
    def _parse_timestamp(raw) -> datetime:
        """Parse an ISO-8601 string, falling back to the current UTC time."""
        now = datetime.now(timezone.utc)
        if not isinstance(raw, str) or not raw:
            return now
        try:
            parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except ValueError:
            return now
        if parsed.tzinfo is None:
            # NOTE(review): offset-less values are assumed to be UTC so they
            # stay comparable with the aware fallback above - confirm.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed
|
||||||
79
src/crawlers/static_crawler.py
Normal file
79
src/crawlers/static_crawler.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from .base import ICrawler
|
||||||
|
from .dto import NewsItemDTO
|
||||||
|
|
||||||
|
class StaticCrawler(ICrawler):
    """Generic crawler for server-rendered pages (aiohttp + BeautifulSoup).

    Every element matching the CSS ``selector`` is treated as one news entry.
    An entry needs both a link and a title to be emitted.
    """

    def __init__(self, url: str, source: str, selector: str):
        """Store the page URL, the source label and the CSS selector for entries."""
        self.url = url
        self.source = source
        self.selector = selector

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Download the page and return the entries found by ``parse_html``.

        Returns:
            The parsed items, or an empty list when the response status is not
            200 or when the request/parsing fails. Failures are logged rather
            than raised so one broken source does not abort the caller.
        """
        # Local import: the module does not import logging at the top level.
        import logging

        logger = logging.getLogger(__name__)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            try:
                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                    if response.status != 200:
                        logger.warning("%s: unexpected HTTP status %s for %s", self.source, response.status, self.url)
                        return []
                    html = await response.text()
                    return self.parse_html(html)
            except Exception:
                # Best-effort boundary: keep returning [] but leave a trace.
                logger.exception("%s: failed to fetch or parse %s", self.source, self.url)
                return []

    def parse_html(self, html: str) -> List[NewsItemDTO]:
        """Turn each element matching ``self.selector`` into a ``NewsItemDTO``.

        Args:
            html: Full page markup.

        Returns:
            Items in document order. Elements without a usable link or title
            are skipped. Each item's timestamp is the time of parsing.
        """
        from urllib.parse import urljoin

        soup = BeautifulSoup(html, "html.parser")
        items = []

        for el in soup.select(self.selector):
            link_el, title = self._pick_link_and_title(el)
            if link_el is None:
                continue

            href = (link_el.get('href') or "").strip()
            if not title or not href:
                continue

            # urljoin resolves every relative form ("/a", "a", "./a", "?p=2",
            # "//host/a") and leaves absolute URLs unchanged.
            url = urljoin(self.url, href)

            content_text = el.get_text(separator=" ", strip=True)

            items.append(NewsItemDTO(
                title=title,
                url=url,
                content_text=content_text,
                source=self.source,
                timestamp=datetime.now(timezone.utc)
            ))

        return items

    @staticmethod
    def _pick_link_and_title(el):
        """Return ``(link_element, title)`` for one matched element.

        Preference order for the link: the first descendant ``<a>`` with text,
        then the first descendant ``<a>``, then the element itself when it is
        an ``<a>``, then the nearest enclosing ``<a>``. ``(None, "")`` is
        returned when no link exists.
        """
        anchors = el.find_all('a')

        # Best case: a link whose own text serves as the title.
        for anchor in anchors:
            text = anchor.get_text(strip=True)
            if text:
                return anchor, text

        if anchors:
            link_el = anchors[0]
        elif el.name == 'a':
            link_el = el
        else:
            # Covers selectors such as "h3" when the heading sits inside a link.
            link_el = el.find_parent('a')

        if link_el is None:
            return None, ""

        title_el = el.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])
        title = title_el.get_text(strip=True) if title_el else ""
        if not title and not anchors:
            # The matched element is (or lives inside) the link itself, so its
            # own text is the best available title.
            title = el.get_text(separator=" ", strip=True)
        return link_el, title
|
||||||
27
tests/crawlers/test_new_crawlers.py
Normal file
27
tests/crawlers/test_new_crawlers.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import pytest
|
||||||
|
import aiohttp
|
||||||
|
from src.crawlers.static_crawler import StaticCrawler
|
||||||
|
from src.crawlers.skolkovo_crawler import SkolkovoCrawler
|
||||||
|
from src.crawlers.dto import NewsItemDTO
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_static_crawler_addmeto():
    """StaticCrawler extracts a linked entry from a Telegram-preview-like snippet.

    Uses canned markup instead of requesting t.me so the test is deterministic
    and runs without network access.
    """
    html = (
        '<div class="tgme_widget_message_text">'
        'Post body <a href="https://example.com/article">read more</a>'
        '</div>'
    )
    crawler = StaticCrawler(url="https://t.me/s/addmeto", source="Telegram: Addmeto", selector=".tgme_widget_message_text")
    items = crawler.parse_html(html)
    assert len(items) == 1
    assert items[0].source == "Telegram: Addmeto"
    assert items[0].url == "https://example.com/article"
|
||||||
|
@pytest.mark.asyncio
async def test_static_crawler_rsf():
    """StaticCrawler resolves a root-relative link against the page URL.

    Uses canned markup instead of requesting rscf.ru so the test is
    deterministic and runs without network access.
    """
    html = (
        '<div class="news-item">'
        '<a href="/en/news/sample-story/">Sample story</a>'
        '<p>Short summary.</p>'
        '</div>'
    )
    crawler = StaticCrawler(url="https://rscf.ru/en/news/", source="RSF", selector=".news-item")
    items = crawler.parse_html(html)
    assert len(items) == 1
    assert items[0].source == "RSF"
    assert items[0].url == "https://rscf.ru/en/news/sample-story/"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_skolkovo_crawler():
    """SkolkovoCrawler builds an item from a canned ``__NEXT_DATA__`` payload.

    Uses inline markup instead of requesting sk.ru so the test is
    deterministic and runs without network access.
    """
    html = (
        '<html><body>'
        '<script id="__NEXT_DATA__" type="application/json">'
        '{"props": {"pageProps": {"initialProps": {"homeStore": {"news": {"items": ['
        '{"title": "Sample", "slug": "sample-story", "description": "<p>Body</p>",'
        ' "published_at": "2025-01-02T03:04:05Z"}'
        ']}}}}}}'
        '</script>'
        '</body></html>'
    )
    crawler = SkolkovoCrawler(url="https://sk.ru/news/", source="Skolkovo")
    items = crawler.parse_nextjs(html)
    assert len(items) == 1
    assert items[0].source == "Skolkovo"
    assert items[0].url == "https://sk.ru/news/sample-story/"
|
||||||
Loading…
x
Reference in New Issue
Block a user