diff --git a/src/crawlers.yml b/src/crawlers.yml
index 85449d4..6706598 100644
--- a/src/crawlers.yml
+++ b/src/crawlers.yml
@@ -12,10 +12,10 @@ crawlers:
     url: "https://cvpr.thecvf.com/Conferences/2025"
     source: "CVPR 2025"
     selector: ".conference-news-item"
-  - type: playwright
+  - type: static
     url: "https://www.ces.tech/discover/?type=Article%2CSuccess+Story%2CPodcast&sort=desc&topics=Artificial+Intelligence%2CContent+and+Entertainment%2CAccessibility%2CInnovation+For+All"
     source: "CES 2025"
-    selector: ".press-release-item"
+    selector: "h3"
   - type: rss
     url: "https://vc.ru/rss/tag/tech"
     source: "VC.ru Tech"
@@ -49,7 +49,7 @@ crawlers:
   - type: cppconf
     url: "https://cppconf.ru/en/talks/"
     source: "C++ Russia"
-  - type: playwright
+  - type: static
     url: "https://2025.ieee-icra.org/media/"
     source: "ICRA 2025"
     selector: "h4"
@@ -65,25 +65,23 @@ crawlers:
     url: "https://www.hannovermesse.de/en/news/news-articles/"
     source: "Hannover Messe"
     selector: ".news-card"
-  - type: playwright
+  - type: static
     url: "https://rscf.ru/en/news/"
     source: "RSF"
     selector: ".news-item"
-  - type: playwright
+  - type: skolkovo
     url: "https://sk.ru/news/"
     source: "Skolkovo"
-    selector: ".news-list-item"
-  - type: playwright
-    url: "https://research-and-innovation.ec.europa.eu/news_en"
+  - type: rss
+    url: "https://research-and-innovation.ec.europa.eu/node/2/rss_en"
     source: "Horizon Europe"
-    selector: ".ecl-news-item"
   - type: rss
     url: "https://rb.ru/feeds/all/"
     source: "RB.ru"
   - type: rss
     url: "https://habr.com/ru/rss/all/all/?fl=ru"
     source: "Habr"
-  - type: playwright
+  - type: static
     url: "https://t.me/s/addmeto"
     source: "Telegram: Addmeto"
     selector: ".tgme_widget_message_text"
diff --git a/src/crawlers/factory.py b/src/crawlers/factory.py
index 07c92c8..f295dcd 100644
--- a/src/crawlers/factory.py
+++ b/src/crawlers/factory.py
@@ -5,6 +5,8 @@ from src.crawlers.base import ICrawler
 from src.crawlers.rss_crawler import RSSCrawler
 from src.crawlers.playwright_crawler import PlaywrightCrawler
 from src.crawlers.cppconf_crawler import CppConfCrawler
+from src.crawlers.static_crawler import StaticCrawler
+from src.crawlers.skolkovo_crawler import SkolkovoCrawler
 
 logger = logging.getLogger(__name__)
 
@@ -39,6 +41,14 @@ class CrawlerFactory:
                         crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
                 elif crawler_type == 'cppconf':
                     crawlers.append(CppConfCrawler(url=url, source=source))
+                elif crawler_type == 'static':
+                    selector = item.get('selector')
+                    if selector:
+                        crawlers.append(StaticCrawler(url=url, source=source, selector=selector))
+                    else:
+                        logger.warning(f"Missing mandatory field 'selector' for static crawler: {item}")
+                elif crawler_type == 'skolkovo':
+                    crawlers.append(SkolkovoCrawler(url=url, source=source))
                 else:
                     logger.warning(f"Unknown crawler type: {crawler_type}")
 
diff --git a/src/crawlers/skolkovo_crawler.py b/src/crawlers/skolkovo_crawler.py
new file mode 100644
index 0000000..6f3f370
--- /dev/null
+++ b/src/crawlers/skolkovo_crawler.py
@@ -0,0 +1,105 @@
+import json
+import logging
+import re
+import aiohttp
+from datetime import datetime, timezone
+from typing import List
+from .base import ICrawler
+from .dto import NewsItemDTO
+
+logger = logging.getLogger(__name__)
+
+# Next.js serialises the page state as JSON inside this script tag.
+NEXT_DATA_RE = re.compile(
+    r'<script[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>',
+    re.DOTALL,
+)
+TAG_RE = re.compile(r'<[^>]+>')
+
+
+class SkolkovoCrawler(ICrawler):
+    """Crawler for sk.ru news, read from the page's embedded Next.js state."""
+
+    def __init__(self, url: str, source: str = "Skolkovo"):
+        self.url = url
+        self.source = source
+
+    async def fetch_latest(self) -> List[NewsItemDTO]:
+        """Download ``self.url`` and return the news items found in it.
+
+        Best-effort: HTTP, network and parsing failures are logged and an
+        empty list is returned instead of raising.
+        """
+        async with aiohttp.ClientSession() as session:
+            try:
+                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
+                    if response.status != 200:
+                        logger.warning("Skolkovo crawler got HTTP %s for %s", response.status, self.url)
+                        return []
+                    html = await response.text()
+                    return self.parse_nextjs(html)
+            except Exception:
+                logger.exception("Skolkovo crawler failed for %s", self.url)
+                return []
+
+    def parse_nextjs(self, html: str) -> List[NewsItemDTO]:
+        """Extract news items from the ``__NEXT_DATA__`` JSON embedded in *html*.
+
+        Returns an empty list when the script tag is missing or its payload
+        does not have the expected structure.
+        """
+        match = NEXT_DATA_RE.search(html)
+        if not match:
+            return []
+
+        try:
+            data = json.loads(match.group(1))
+            news_data = data["props"]["pageProps"]["initialProps"]["homeStore"]["news"]
+            # "items" may be absent or null in the payload.
+            items_list = news_data.get("items") or []
+        except (KeyError, TypeError, AttributeError, json.JSONDecodeError):
+            return []
+
+        news_items = []
+        for item in items_list:
+            if not isinstance(item, dict):
+                continue
+
+            title = item.get("title") or ""
+            # Slug is used for URL
+            slug = item.get("slug") or ""
+            url = f"https://sk.ru/news/{slug}/" if slug else self.url
+
+            # Clean up simple HTML if present; the field may be null.
+            content_text = TAG_RE.sub(' ', item.get("description") or "")
+            content_text = ' '.join(content_text.split())
+
+            news_items.append(NewsItemDTO(
+                title=title,
+                url=url,
+                content_text=content_text,
+                source=self.source,
+                timestamp=self._parse_timestamp(item),
+            ))
+
+        return news_items
+
+    @staticmethod
+    def _parse_timestamp(item: dict) -> datetime:
+        """Return the item's publication time as a timezone-aware datetime.
+
+        Falls back to the current UTC time when the value is missing or
+        malformed. Naive values are treated as UTC (assumption - confirm
+        against the sk.ru payload).
+        """
+        ts_str = item.get("published_at") or item.get("created_at")
+        if isinstance(ts_str, str):
+            try:
+                timestamp = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
+            except ValueError:
+                pass
+            else:
+                if timestamp.tzinfo is None:
+                    timestamp = timestamp.replace(tzinfo=timezone.utc)
+                return timestamp
+        return datetime.now(timezone.utc)
diff --git a/src/crawlers/static_crawler.py b/src/crawlers/static_crawler.py
new file mode 100644
index 0000000..a83fedf
--- /dev/null
+++ b/src/crawlers/static_crawler.py
@@ -0,0 +1,97 @@
+import asyncio
+import logging
+import aiohttp
+import re
+from typing import List
+from datetime import datetime, timezone
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup
+from .base import ICrawler
+from .dto import NewsItemDTO
+
+logger = logging.getLogger(__name__)
+
+
+class StaticCrawler(ICrawler):
+    """Crawler for server-rendered pages: one HTTP GET plus a CSS selector."""
+
+    def __init__(self, url: str, source: str, selector: str):
+        self.url = url
+        self.source = source
+        self.selector = selector
+
+    async def fetch_latest(self) -> List[NewsItemDTO]:
+        """Download ``self.url`` and return the items matched by the selector.
+
+        Best-effort: HTTP, network and parsing failures are logged and an
+        empty list is returned instead of raising.
+        """
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        async with aiohttp.ClientSession(headers=headers) as session:
+            try:
+                async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response:
+                    if response.status != 200:
+                        logger.warning("Static crawler got HTTP %s for %s", response.status, self.url)
+                        return []
+                    html = await response.text()
+                    return self.parse_html(html)
+            except Exception:
+                logger.exception("Static crawler failed for %s", self.url)
+                return []
+
+    def parse_html(self, html: str) -> List[NewsItemDTO]:
+        """Build one news item per element of *html* matching ``self.selector``.
+
+        Elements without a usable link or title are skipped. The timestamp is
+        the crawl time because the markup is not inspected for a date.
+        """
+        soup = BeautifulSoup(html, "html.parser")
+        items = []
+
+        for el in soup.select(self.selector):
+            link_el, title = self._find_link_and_title(el)
+            if link_el is None:
+                continue
+
+            href = link_el.get('href')
+            if not title or not href:
+                continue
+
+            items.append(NewsItemDTO(
+                title=title,
+                # urljoin resolves every relative form ("/x", "x", "?p=2")
+                # and leaves absolute URLs untouched.
+                url=urljoin(self.url, href),
+                content_text=el.get_text(separator=" ", strip=True),
+                source=self.source,
+                timestamp=datetime.now(timezone.utc),
+            ))
+
+        return items
+
+    @staticmethod
+    def _find_link_and_title(el):
+        """Return ``(anchor, title)`` for a matched element, or ``(None, "")``."""
+        all_links = el.find_all('a')
+
+        # Prefer the first link that has text content
+        for a in all_links:
+            txt = a.get_text(strip=True)
+            if txt:
+                return a, txt
+
+        # If no link with text, take the first link and look for a title elsewhere
+        if all_links:
+            title_el = el.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])
+            title = title_el.get_text(strip=True) if title_el else ""
+            return all_links[0], title
+
+        # The selector may match the anchor itself or an element wrapped by
+        # one, e.g. <a href="..."><h3>Title</h3></a>.
+        wrapper = el if el.name == 'a' else el.find_parent('a')
+        if wrapper is not None:
+            return wrapper, el.get_text(strip=True)
+
+        return None, ""
diff --git a/tests/crawlers/test_new_crawlers.py b/tests/crawlers/test_new_crawlers.py
new file mode 100644
index 0000000..51eff7f
--- /dev/null
+++ b/tests/crawlers/test_new_crawlers.py
@@ -0,0 +1,57 @@
+import json
+import pytest
+import aiohttp
+from src.crawlers.static_crawler import StaticCrawler
+from src.crawlers.skolkovo_crawler import SkolkovoCrawler
+from src.crawlers.dto import NewsItemDTO
+
+
+# The three async tests below hit live websites and need network access.
+@pytest.mark.asyncio
+async def test_static_crawler_addmeto():
+    crawler = StaticCrawler(url="https://t.me/s/addmeto", source="Telegram: Addmeto", selector=".tgme_widget_message_text")
+    items = await crawler.fetch_latest()
+    assert len(items) > 0
+    assert items[0].source == "Telegram: Addmeto"
+
+
+@pytest.mark.asyncio
+async def test_static_crawler_rsf():
+    crawler = StaticCrawler(url="https://rscf.ru/en/news/", source="RSF", selector=".news-item")
+    items = await crawler.fetch_latest()
+    assert len(items) > 0
+    assert items[0].source == "RSF"
+    assert "rscf.ru" in items[0].url
+
+
+@pytest.mark.asyncio
+async def test_skolkovo_crawler():
+    crawler = SkolkovoCrawler(url="https://sk.ru/news/", source="Skolkovo")
+    items = await crawler.fetch_latest()
+    assert len(items) > 0
+    assert items[0].source == "Skolkovo"
+    assert "sk.ru" in items[0].url
+
+
+# Offline tests: feed the parsers fixed markup so they do not depend on the sites.
+def test_static_crawler_parse_html_resolves_relative_links():
+    html = '<div class="news-item"><a href="news/42/">Grant results</a><p>Details</p></div>'
+    crawler = StaticCrawler(url="https://rscf.ru/en/news/", source="RSF", selector=".news-item")
+    items = crawler.parse_html(html)
+    assert len(items) == 1
+    assert isinstance(items[0], NewsItemDTO)
+    assert items[0].title == "Grant results"
+    assert items[0].url == "https://rscf.ru/en/news/news/42/"
+
+
+def test_skolkovo_crawler_parse_nextjs():
+    news = {"items": [{"title": "Hello", "slug": "hello", "description": "<p>Body</p>",
+                       "published_at": "2025-01-02T03:04:05Z"}]}
+    payload = {"props": {"pageProps": {"initialProps": {"homeStore": {"news": news}}}}}
+    html = '<script id="__NEXT_DATA__" type="application/json">' + json.dumps(payload) + '</script>'
+    crawler = SkolkovoCrawler(url="https://sk.ru/news/", source="Skolkovo")
+    items = crawler.parse_nextjs(html)
+    assert len(items) == 1
+    assert items[0].title == "Hello"
+    assert items[0].url == "https://sk.ru/news/hello/"
+    assert items[0].content_text == "Body"