From a304ae9cd261d4f088c57bcd97cf2b4461b78da7 Mon Sep 17 00:00:00 2001 From: Artur Mukhamadiev Date: Mon, 16 Mar 2026 00:11:15 +0300 Subject: [PATCH] feat(crawler): add academic and research sources - Implement crawlers for Microsoft Research, SciRate, and Google Scholar - Use Playwright with stealth for Google Scholar anti-bot mitigation - Update CrawlerFactory to support new research crawler types - Add unit and integration tests for all academic sources with high coverage --- src/crawlers.yml | 16 +- src/crawlers/factory.py | 12 +- src/crawlers/microsoft_research_crawler.py | 7 + src/crawlers/scholar_crawler.py | 92 ++++++++ src/crawlers/scirate_crawler.py | 65 ++++++ src/crawlers/static_crawler.py | 8 +- tests/crawlers/test_academic_crawlers.py | 216 ++++++++++++++++++ tests/crawlers/test_factory.py | 32 ++- .../test_microsoft_research_crawler.py | 38 +++ tests/crawlers/test_scholar_crawler.py | 115 ++++++++++ tests/crawlers/test_scirate_crawler.py | 90 ++++++++ 11 files changed, 677 insertions(+), 14 deletions(-) create mode 100644 src/crawlers/microsoft_research_crawler.py create mode 100644 src/crawlers/scholar_crawler.py create mode 100644 src/crawlers/scirate_crawler.py create mode 100644 tests/crawlers/test_academic_crawlers.py create mode 100644 tests/crawlers/test_microsoft_research_crawler.py create mode 100644 tests/crawlers/test_scholar_crawler.py create mode 100644 tests/crawlers/test_scirate_crawler.py diff --git a/src/crawlers.yml b/src/crawlers.yml index 6706598..6bcadbc 100644 --- a/src/crawlers.yml +++ b/src/crawlers.yml @@ -93,4 +93,18 @@ crawlers: source: "Habr Code Quality" - type: rss url: "https://habr.com/ru/rss/articles/rated100/?fl=ru" - source: "Habr High Ranked" \ No newline at end of file + source: "Habr High Ranked" + - type: rss + url: "https://www.microsoft.com/en-us/research/feed/" + source: "Microsoft Research" + - type: scirate + url: "https://scirate.com/" + source: "SciRate" + - type: scholar + url: "https://scholar.google.com/" + source: "Google Scholar" + query: "WebGPU machine learning" + - type: scholar + url: "https://scholar.google.com/" + source: "Google Scholar" + query: "NPU acceleration" \ No newline at end of file diff --git a/src/crawlers/factory.py b/src/crawlers/factory.py index f295dcd..67285fa 100644 --- a/src/crawlers/factory.py +++ b/src/crawlers/factory.py @@ -7,6 +7,9 @@ from src.crawlers.playwright_crawler import PlaywrightCrawler from src.crawlers.cppconf_crawler import CppConfCrawler from src.crawlers.static_crawler import StaticCrawler from src.crawlers.skolkovo_crawler import SkolkovoCrawler +from src.crawlers.scirate_crawler import SciRateCrawler +from src.crawlers.scholar_crawler import ScholarCrawler +from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler logger = logging.getLogger(__name__) @@ -30,7 +33,7 @@ class CrawlerFactory: url = item.get('url') source = item.get('source') - if not url or not source: + if not source or (not url and crawler_type != 'scholar'): logger.warning(f"Missing mandatory fields (url, source) for crawler: {item}") continue @@ -49,6 +52,13 @@ class CrawlerFactory: logger.warning(f"Missing mandatory field 'selector' for static crawler: {item}") elif crawler_type == 'skolkovo': crawlers.append(SkolkovoCrawler(url=url, source=source)) + elif crawler_type == 'scirate': + crawlers.append(SciRateCrawler(url=url, source=source)) + elif crawler_type == 'scholar': + query = item.get('query', 'Artificial Intelligence') + crawlers.append(ScholarCrawler(query=query, source=source)) + 
elif crawler_type == 'microsoft_research': + crawlers.append(MicrosoftResearchCrawler(url=url, source=source)) else: logger.warning(f"Unknown crawler type: {crawler_type}") diff --git a/src/crawlers/microsoft_research_crawler.py b/src/crawlers/microsoft_research_crawler.py new file mode 100644 index 0000000..106aec1 --- /dev/null +++ b/src/crawlers/microsoft_research_crawler.py @@ -0,0 +1,7 @@ +from typing import List +from .rss_crawler import RSSCrawler +from .dto import NewsItemDTO + +class MicrosoftResearchCrawler(RSSCrawler): + def __init__(self, url: str = "https://www.microsoft.com/en-us/research/feed/", source: str = "Microsoft Research"): + super().__init__(url, source) diff --git a/src/crawlers/scholar_crawler.py b/src/crawlers/scholar_crawler.py new file mode 100644 index 0000000..a3104cc --- /dev/null +++ b/src/crawlers/scholar_crawler.py @@ -0,0 +1,92 @@ +import logging +from typing import List, Optional +from playwright.async_api import async_playwright +from playwright_stealth import Stealth +from datetime import datetime, timezone +from urllib.parse import urljoin + +from .base import ICrawler +from .dto import NewsItemDTO + +logger = logging.getLogger(__name__) + +class ScholarCrawler(ICrawler): + def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"): + self.query = query + # Google Scholar query URL + self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}" + self.source = source + + async def fetch_latest(self) -> List[NewsItemDTO]: + try: + async with async_playwright() as p: + # Launch browser + browser = await p.chromium.launch(headless=True) + try: + # Create a new context with a realistic user agent + context = await browser.new_context( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) + page = await context.new_page() + + # Apply stealth to avoid detection + await Stealth().apply_stealth_async(page) + + logger.info(f"Navigating to {self.url}") + await page.goto(self.url, wait_until="networkidle", timeout=60000) + + # Check for CAPTCHA or blocking + content = await page.content() + if "CAPTCHA" in content or "not a robot" in content: + logger.warning("Google Scholar CAPTCHA or bot detection triggered") + return [] + + # Select result items + results = await page.query_selector_all(".gs_ri") + news_items = [] + + for res in results: + # Title element + title_el = await res.query_selector(".gs_rt a") + if not title_el: + continue + + title = await title_el.inner_text() + url = await title_el.get_attribute("href") + + # Snippet/Abstract + snippet_el = await res.query_selector(".gs_rs") + snippet = await snippet_el.inner_text() if snippet_el else "" + + # Metadata (authors, journal, year) + metadata_el = await res.query_selector(".gs_a") + metadata = await metadata_el.inner_text() if metadata_el else "" + + # Citation count (usually in the bottom links) + # We look for a link that starts with "Cited by" + citation_count = "0" + bottom_links = await res.query_selector_all(".gs_fl a") + for link in bottom_links: + text = await link.inner_text() + if "Cited by" in text: + citation_count = text.replace("Cited by", "").strip() + break + + content_text = f"{metadata}\n\n{snippet}\n\nCitations: {citation_count}" + + news_items.append( + NewsItemDTO( + title=title.strip(), + url=url or self.url, + content_text=content_text.strip(), + source=f"{self.source}: {self.query}", + timestamp=datetime.now(timezone.utc) + ) + ) + + return 
news_items + finally: + await browser.close() + except Exception as e: + logger.error(f"Error crawling Google Scholar: {e}") + return [] diff --git a/src/crawlers/scirate_crawler.py b/src/crawlers/scirate_crawler.py new file mode 100644 index 0000000..f6a57c8 --- /dev/null +++ b/src/crawlers/scirate_crawler.py @@ -0,0 +1,65 @@ +import aiohttp +from datetime import datetime, timezone +from typing import List +from bs4 import BeautifulSoup +from urllib.parse import urljoin + +from .base import ICrawler +from .dto import NewsItemDTO + +class SciRateCrawler(ICrawler): + def __init__(self, url: str = "https://scirate.com/", source: str = "SciRate"): + self.url = url + self.source = source + + async def fetch_latest(self) -> List[NewsItemDTO]: + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + async with aiohttp.ClientSession(headers=headers) as session: + try: + async with session.get(self.url, timeout=aiohttp.ClientTimeout(total=30)) as response: + if response.status != 200: + return [] + html = await response.text() + return self.parse_html(html) + except Exception: + return [] + + def parse_html(self, html: str) -> List[NewsItemDTO]: + soup = BeautifulSoup(html, "html.parser") + items = [] + + # SciRate papers are typically in li.paper-list-item or div.paper + papers = soup.select("li.paper-list-item, div.paper") + + for paper in papers: + title_el = paper.select_one(".title a") + if not title_el: + continue + + title = title_el.get_text(strip=True) + link = title_el.get("href", "") + if isinstance(link, list): + link = link[0] if link else "" + + if link and link.startswith("/"): + link = urljoin(self.url, link) + + authors_el = paper.select_one(".authors") + authors = authors_el.get_text(strip=True) if authors_el else "" + + abstract_el = paper.select_one(".abstract") + abstract = abstract_el.get_text(strip=True) if abstract_el else "" + + content_text = f"Authors: {authors}\n\n{abstract}" + + items.append(NewsItemDTO( + title=title, + url=link or self.url, + content_text=content_text.strip(), + source=self.source, + timestamp=datetime.now(timezone.utc) + )) + + return items diff --git a/src/crawlers/static_crawler.py b/src/crawlers/static_crawler.py index a83fedf..f00ec03 100644 --- a/src/crawlers/static_crawler.py +++ b/src/crawlers/static_crawler.py @@ -57,20 +57,22 @@ class StaticCrawler(ICrawler): continue url = link_el.get('href') if link_el else "" + if isinstance(url, list): + url = url[0] if url else "" if not title or not url: continue # Normalize URL - if url.startswith('/'): + if str(url).startswith('/'): from urllib.parse import urljoin - url = urljoin(self.url, url) + url = urljoin(self.url, str(url)) content_text = el.get_text(separator=" ", strip=True) items.append(NewsItemDTO( title=title, - url=url, + url=str(url), content_text=content_text, source=self.source, timestamp=datetime.now(timezone.utc) diff --git a/tests/crawlers/test_academic_crawlers.py b/tests/crawlers/test_academic_crawlers.py new file mode 100644 index 0000000..9974def --- /dev/null +++ b/tests/crawlers/test_academic_crawlers.py @@ -0,0 +1,216 @@ +import pytest +import aiohttp +from unittest.mock import AsyncMock, patch, MagicMock +from datetime import datetime, timezone +from src.crawlers.scirate_crawler import SciRateCrawler +from src.crawlers.scholar_crawler import ScholarCrawler +from src.crawlers.factory import CrawlerFactory +from src.crawlers.dto import NewsItemDTO + +@pytest.mark.asyncio +async 
def test_scirate_crawler_parse_html():
+    crawler = SciRateCrawler()
+    sample_html = """
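+        <!-- Minimal SciRate listing: two li.paper-list-item entries carrying the
+             .title/.authors/.abstract nodes that SciRateCrawler.parse_html selects on. -->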
+        <ul>
+          <li class="paper-list-item">
+            <div class="title"><a href="/arxiv/2403.12345">Quantum Supremacy in the Kitchen</a></div>
+            <div class="authors">John Doe, Jane Smith</div>
+            <div class="abstract">We demonstrate quantum supremacy by perfectly boiling an egg.</div>
+          </li>
+          <li class="paper-list-item">
+            <div class="title"><a href="/arxiv/2403.67890">AI for Cats</a></div>
+            <div class="authors">Cat Lover</div>
+            <div class="abstract">A deep learning approach to understanding meows.</div>
+          </li>
+        </ul>
+    """
+
+    items = crawler.parse_html(sample_html)
+
+    assert len(items) == 2
+
+    assert items[0].title == "Quantum Supremacy in the Kitchen"
+    assert "arxiv/2403.12345" in items[0].url
+    assert "John Doe, Jane Smith" in items[0].content_text
+    assert "boiling an egg" in items[0].content_text
+    assert items[0].source == "SciRate"
+
+    assert items[1].title == "AI for Cats"
+    assert items[1].url == "https://scirate.com/arxiv/2403.67890"
+    assert "Cat Lover" in items[1].content_text
+    assert "meows" in items[1].content_text
+
+@pytest.mark.asyncio
+async def test_scirate_crawler_fetch_latest():
+    crawler = SciRateCrawler()
+    sample_html = """
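+        <!-- One paper with a title link and one without, exercising the parser's
+             skip-on-missing-title branch; the href is a placeholder value. -->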
+        <ul>
+          <li class="paper-list-item">
+            <div class="title"><a href="/arxiv/2403.00000">Quantum Supremacy</a></div>
+          </li>
+          <li class="paper-list-item">
+            <div class="authors">Untitled entry that the parser should skip</div>
+          </li>
+        </ul>
  • + """ + + with patch("aiohttp.ClientSession.get") as mock_get: + mock_response = AsyncMock() + mock_response.status = 200 + mock_response.text.return_value = sample_html + + mock_get.return_value.__aenter__.return_value = mock_response + + items = await crawler.fetch_latest() + + assert len(items) == 1 + assert items[0].title == "Quantum Supremacy" + +@pytest.mark.asyncio +async def test_scirate_crawler_fetch_error(): + crawler = SciRateCrawler() + + with patch("aiohttp.ClientSession.get") as mock_get: + mock_response = AsyncMock() + mock_response.status = 404 + + mock_get.return_value.__aenter__.return_value = mock_response + + items = await crawler.fetch_latest() + + assert items == [] + +@pytest.mark.asyncio +async def test_scholar_crawler_fetch_latest(): + crawler = ScholarCrawler(query="WebGPU", source="Scholar") + + with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \ + patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class: + + mock_p = AsyncMock() + mock_playwright.return_value.__aenter__.return_value = mock_p + + mock_browser = AsyncMock() + mock_p.chromium.launch.return_value = mock_browser + + mock_context = AsyncMock() + mock_browser.new_context.return_value = mock_context + + mock_page = AsyncMock() + mock_context.new_page.return_value = mock_page + mock_page.content.return_value = "Results" + + # Mock Stealth instance and method + mock_stealth_instance = MagicMock() + mock_stealth_instance.apply_stealth_async = AsyncMock() + mock_stealth_class.return_value = mock_stealth_instance + + # Mock result elements + mock_res = AsyncMock() + + mock_title_el = AsyncMock() + mock_title_el.inner_text.return_value = "WebGPU Accelerated ML" + mock_title_el.get_attribute.return_value = "https://arxiv.org/abs/2403.abc" + + mock_snippet_el = AsyncMock() + mock_snippet_el.inner_text.return_value = "This paper discusses WebGPU..." + + mock_metadata_el = AsyncMock() + mock_metadata_el.inner_text.return_value = "J. Smith, 2024 - arxiv.org" + + mock_citation_link = AsyncMock() + mock_citation_link.inner_text.return_value = "Cited by 15" + + mock_res.query_selector.side_effect = lambda selector: { + ".gs_rt a": mock_title_el, + ".gs_rs": mock_snippet_el, + ".gs_a": mock_metadata_el + }.get(selector) + + mock_res.query_selector_all.return_value = [mock_citation_link] + + mock_page.query_selector_all.return_value = [mock_res] + + items = await crawler.fetch_latest() + + assert len(items) == 1 + assert items[0].title == "WebGPU Accelerated ML" + assert items[0].url == "https://arxiv.org/abs/2403.abc" + assert "15" in items[0].content_text + assert "J. 
Smith, 2024" in items[0].content_text + assert items[0].source == "Scholar: WebGPU" + + mock_browser.close.assert_called_once() + +@pytest.mark.asyncio +async def test_scholar_crawler_captcha_detection(): + crawler = ScholarCrawler(query="WebGPU", source="Scholar") + + with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \ + patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class: + + mock_p = AsyncMock() + mock_playwright.return_value.__aenter__.return_value = mock_p + + mock_browser = AsyncMock() + mock_p.chromium.launch.return_value = mock_browser + + mock_context = AsyncMock() + mock_browser.new_context.return_value = mock_context + + mock_page = AsyncMock() + mock_context.new_page.return_value = mock_page + + # Mock Stealth instance and method + mock_stealth_instance = MagicMock() + mock_stealth_instance.apply_stealth_async = AsyncMock() + mock_stealth_class.return_value = mock_stealth_instance + + # Simulate CAPTCHA in content + mock_page.content.return_value = "Please verify you are not a robot CAPTCHA" + + items = await crawler.fetch_latest() + + assert items == [] + mock_browser.close.assert_called_once() + +@pytest.mark.asyncio +async def test_scholar_crawler_error_handling(): + crawler = ScholarCrawler(query="WebGPU", source="Scholar") + + with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \ + patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class: + + mock_p = AsyncMock() + mock_playwright.return_value.__aenter__.return_value = mock_p + + mock_browser = AsyncMock() + mock_p.chromium.launch.return_value = mock_browser + + mock_context = AsyncMock() + mock_browser.new_context.return_value = mock_context + + mock_page = AsyncMock() + mock_context.new_page.return_value = mock_page + + mock_stealth_instance = MagicMock() + mock_stealth_instance.apply_stealth_async = AsyncMock() + mock_stealth_class.return_value = mock_stealth_instance + + # Simulate exception during goto + mock_page.goto.side_effect = Exception("Browser crash") + + items = await crawler.fetch_latest() + + assert items == [] + mock_browser.close.assert_called_once() + +def test_factory_registration(): + # Test if SciRate and Scholar are registered in the factory + with patch("builtins.open", MagicMock()): + with patch("yaml.safe_load") as mock_yaml: + mock_yaml.return_value = { + 'crawlers': [ + {'type': 'scirate', 'url': 'https://scirate.com/', 'source': 'SciRate'}, + {'type': 'scholar', 'url': 'https://scholar.google.com/', 'source': 'Scholar', 'query': 'AI'} + ] + } + crawlers = CrawlerFactory.load_from_yaml("fake_path.yml") + + assert len(crawlers) == 2 + assert isinstance(crawlers[0], SciRateCrawler) + assert isinstance(crawlers[1], ScholarCrawler) + assert crawlers[1].query == 'AI' diff --git a/tests/crawlers/test_factory.py b/tests/crawlers/test_factory.py index bf91847..d29944d 100644 --- a/tests/crawlers/test_factory.py +++ b/tests/crawlers/test_factory.py @@ -4,6 +4,12 @@ from unittest.mock import patch, mock_open from src.crawlers.factory import CrawlerFactory from src.crawlers.rss_crawler import RSSCrawler from src.crawlers.playwright_crawler import PlaywrightCrawler +from src.crawlers.scirate_crawler import SciRateCrawler +from src.crawlers.scholar_crawler import ScholarCrawler +from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler +from src.crawlers.static_crawler import StaticCrawler +from src.crawlers.skolkovo_crawler import SkolkovoCrawler +from src.crawlers.cppconf_crawler import CppConfCrawler 
 VALID_YAML = """
 crawlers:
@@ -14,6 +20,15 @@ crawlers:
     url: "https://example.com/playwright"
     source: "Example Playwright"
     selector: ".item"
+  - type: scirate
+    url: "https://scirate.com/"
+    source: "SciRate"
+  - type: scholar
+    query: "AI"
+    source: "Google Scholar"
+  - type: microsoft_research
+    url: "https://example.com/msr"
+    source: "Microsoft Research"
 """

 INVALID_TYPE_YAML = """
 crawlers:
@@ -45,15 +60,13 @@ def test_load_from_yaml_valid():
     with patch("builtins.open", mock_open(read_data=VALID_YAML)):
         crawlers = CrawlerFactory.load_from_yaml("dummy.yml")

-    assert len(crawlers) == 2
+    assert len(crawlers) == 5
     assert isinstance(crawlers[0], RSSCrawler)
-    assert crawlers[0].url == "https://example.com/rss"
-    assert crawlers[0].source == "Example RSS"
-
-    assert isinstance(crawlers[1], PlaywrightCrawler)
-    assert crawlers[1].url == "https://example.com/playwright"
-    assert crawlers[1].source == "Example Playwright"
-    assert crawlers[1].selector == ".item"
+    assert isinstance(crawlers[1], PlaywrightCrawler)
+    assert isinstance(crawlers[2], SciRateCrawler)
+    assert isinstance(crawlers[3], ScholarCrawler)
+    assert isinstance(crawlers[4], MicrosoftResearchCrawler)
+

 def test_load_from_yaml_unknown_type():
     with patch("builtins.open", mock_open(read_data=INVALID_TYPE_YAML)):
@@ -112,8 +125,9 @@ def test_integration_load_actual_config():

     # Verify types and mandatory fields for all loaded crawlers
     for crawler in crawlers:
-        assert isinstance(crawler, (RSSCrawler, PlaywrightCrawler))
-        assert crawler.url.startswith("http")
+        assert isinstance(crawler, (RSSCrawler, PlaywrightCrawler, StaticCrawler, SkolkovoCrawler, CppConfCrawler, SciRateCrawler, ScholarCrawler, MicrosoftResearchCrawler))
+        if not isinstance(crawler, ScholarCrawler):
+            assert crawler.url.startswith("http")
         assert crawler.source
         if isinstance(crawler, PlaywrightCrawler):
             # According to src/crawlers.yml, all playwright crawlers currently have selectors
diff --git a/tests/crawlers/test_microsoft_research_crawler.py b/tests/crawlers/test_microsoft_research_crawler.py
new file mode 100644
index 0000000..80120c7
--- /dev/null
+++ b/tests/crawlers/test_microsoft_research_crawler.py
@@ -0,0 +1,38 @@
+import pytest
+from unittest.mock import AsyncMock, patch, MagicMock
+from datetime import datetime, timezone
+from src.crawlers.microsoft_research_crawler import MicrosoftResearchCrawler
+from src.crawlers.dto import NewsItemDTO
+
+MOCK_MSR_RSS = """
+<rss version="2.0">
+  <channel>
+    <title>Microsoft Research</title>
+    <item>
+      <title>MSR Paper Title</title>
+      <link>https://www.microsoft.com/en-us/research/publication/msr-paper/</link>
+      <description>MSR Paper Description</description>
+      <pubDate>Tue, 10 Mar 2026 10:00:00 GMT</pubDate>
+    </item>
+  </channel>
+</rss>
+"""
+
+@pytest.mark.asyncio
+async def test_microsoft_research_crawler_fetch_latest():
+    crawler = MicrosoftResearchCrawler()
+
+    with patch("aiohttp.ClientSession.get") as mock_get:
+        mock_response = AsyncMock()
+        mock_response.text.return_value = MOCK_MSR_RSS
+        mock_response.status = 200
+        mock_response.raise_for_status = MagicMock()
+        mock_get.return_value.__aenter__.return_value = mock_response
+
+        items = await crawler.fetch_latest()
+
+        assert len(items) == 1
+        assert items[0].title == "MSR Paper Title"
+        assert items[0].url == "https://www.microsoft.com/en-us/research/publication/msr-paper/"
+        assert items[0].source == "Microsoft Research"
+        assert items[0].timestamp == datetime(2026, 3, 10, 10, 0, tzinfo=timezone.utc)
diff --git a/tests/crawlers/test_scholar_crawler.py b/tests/crawlers/test_scholar_crawler.py
new file mode 100644
index 0000000..f4a3e98
--- /dev/null
+++ b/tests/crawlers/test_scholar_crawler.py
@@ -0,0 +1,115 @@
+import pytest
+from unittest.mock import
AsyncMock, patch, MagicMock +from datetime import datetime, timezone +from src.crawlers.scholar_crawler import ScholarCrawler +from src.crawlers.dto import NewsItemDTO + +@pytest.mark.asyncio +async def test_scholar_crawler_fetch_latest(): + query = "Large Language Models" + source = "Google Scholar" + crawler = ScholarCrawler(query=query, source=source) + + with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright, \ + patch("src.crawlers.scholar_crawler.Stealth") as mock_stealth_class: + + mock_stealth = MagicMock() + mock_stealth.apply_stealth_async = AsyncMock() + mock_stealth_class.return_value = mock_stealth + + mock_p = AsyncMock() + mock_playwright.return_value.__aenter__.return_value = mock_p + + mock_browser = AsyncMock() + mock_p.chromium.launch.return_value = mock_browser + + mock_context = AsyncMock() + mock_browser.new_context.return_value = mock_context + + mock_page = AsyncMock() + mock_context.new_page.return_value = mock_page + + # Mock content to avoid CAPTCHA detection in crawler + mock_page.content.return_value = "Results" + + # Setup mock results + mock_res = AsyncMock() + + # Title element + mock_title_el = AsyncMock() + mock_title_el.inner_text.return_value = "LLM Paper Title" + mock_title_el.get_attribute.return_value = "https://arxiv.org/abs/2401.00001" + + mock_res.query_selector.side_effect = lambda selector: { + ".gs_rt a": mock_title_el, + ".gs_rs": AsyncMock(inner_text=AsyncMock(return_value="This is a snippet")), + ".gs_a": AsyncMock(inner_text=AsyncMock(return_value="Authors et al.")), + }.get(selector) + + # Citations + mock_citation_link = AsyncMock() + mock_citation_link.inner_text.return_value = "Cited by 123" + mock_res.query_selector_all.return_value = [mock_citation_link] + + mock_page.query_selector_all.return_value = [mock_res] + + items = await crawler.fetch_latest() + + assert len(items) == 1 + assert items[0].title == "LLM Paper Title" + +@pytest.mark.asyncio +async def test_scholar_crawler_no_title(): + crawler = ScholarCrawler() + with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright: + mock_p = AsyncMock() + mock_playwright.return_value.__aenter__.return_value = mock_p + mock_browser = AsyncMock() + mock_p.chromium.launch.return_value = mock_browser + mock_context = AsyncMock() + mock_browser.new_context.return_value = mock_context + mock_page = AsyncMock() + mock_context.new_page.return_value = mock_page + mock_page.content.return_value = "Results" + + # Result item without title link + mock_res = AsyncMock() + mock_res.query_selector.return_value = None + mock_page.query_selector_all.return_value = [mock_res] + + items = await crawler.fetch_latest() + assert len(items) == 0 + +@pytest.mark.asyncio +async def test_scholar_crawler_exception(): + crawler = ScholarCrawler() + with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright: + mock_p = AsyncMock() + mock_playwright.return_value.__aenter__.return_value = mock_p + mock_browser = AsyncMock() + mock_p.chromium.launch.return_value = mock_browser + + # Force exception + mock_browser.new_context.side_effect = Exception("Browser error") + + items = await crawler.fetch_latest() + assert items == [] + +@pytest.mark.asyncio +async def test_scholar_crawler_captcha(): + crawler = ScholarCrawler() + with patch("src.crawlers.scholar_crawler.async_playwright") as mock_playwright: + mock_p = AsyncMock() + mock_playwright.return_value.__aenter__.return_value = mock_p + mock_browser = AsyncMock() + mock_p.chromium.launch.return_value = 
mock_browser + mock_context = AsyncMock() + mock_browser.new_context.return_value = mock_context + mock_page = AsyncMock() + mock_context.new_page.return_value = mock_page + + # Simulate CAPTCHA + mock_page.content.return_value = "Please solve this CAPTCHA" + + items = await crawler.fetch_latest() + assert items == [] diff --git a/tests/crawlers/test_scirate_crawler.py b/tests/crawlers/test_scirate_crawler.py new file mode 100644 index 0000000..a89d577 --- /dev/null +++ b/tests/crawlers/test_scirate_crawler.py @@ -0,0 +1,90 @@ +import pytest +from unittest.mock import AsyncMock, patch, MagicMock +from datetime import datetime, timezone +from src.crawlers.scirate_crawler import SciRateCrawler +from src.crawlers.dto import NewsItemDTO + +MOCK_SCIRATE_HTML = """ + + +
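+<!-- Module-level SciRate fixture: titles and hrefs are placeholder values;
+     only the li.paper-list-item / .title / .authors / .abstract structure
+     matters to parse_html. -->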
+<html>
+<body>
+  <ul>
+    <li class="paper-list-item">
+      <div class="title"><a href="/arxiv/2401.00001">A New Network Architecture</a></div>
+      <div class="authors">Vaswani et al.</div>
+      <div class="abstract">This paper presents a new architecture...</div>
+    </li>
+    <li class="paper-list-item">
+      <div class="title"><a href="/arxiv/2401.00002">Another Paper</a></div>
+      <div class="authors">Doe and Smith</div>
+      <div class="abstract">Abstract of another paper.</div>
+    </li>
+  </ul>
+</body>
+</html>
+"""
+
+@pytest.mark.asyncio
+async def test_scirate_crawler_fetch_latest():
+    url = "https://scirate.com/"
+    source = "SciRate"
+    crawler = SciRateCrawler(url, source)
+
+    # HTML with multiple items, one missing title, one with list-like link
+    mock_html = """
+    <html>
+    <body>
+    <ul>
+      <li class="paper-list-item">
+        <div class="title"><a href="/arxiv/1">List Link Paper</a></div>
+      </li>
+      <li class="paper-list-item">
+        <div class="title">No link here</div>
+      </li>
+      <li class="paper-list-item">
+        <div class="title"><a href="/arxiv/3">Paper Three</a></div>
+      </li>
+    </ul>
+    </body>
+    </html>
+    """
+
+    with patch("aiohttp.ClientSession.get") as mock_get:
+        mock_response = AsyncMock()
+        mock_response.text.return_value = mock_html
+        mock_response.status = 200
+        mock_get.return_value.__aenter__.return_value = mock_response
+
+        # We also want to test the 'isinstance(link, list)' part.
+        # This is tricky because BS4 normally doesn't return a list for href.
+        # But we can mock title_el.get to return a list.
+        with patch("bs4.element.Tag.get", side_effect=[["/arxiv/list"], "/arxiv/3"]):
+            items = await crawler.fetch_latest()
+
+            assert len(items) == 2
+            assert items[0].url == "https://scirate.com/arxiv/list"
+            assert items[1].url == "https://scirate.com/arxiv/3"
+
+@pytest.mark.asyncio
+async def test_scirate_crawler_exception():
+    crawler = SciRateCrawler()
+    with patch("aiohttp.ClientSession.get") as mock_get:
+        mock_response = AsyncMock()
+        mock_response.text.return_value = ""
+        mock_response.status = 200
+        mock_get.return_value.__aenter__.return_value = mock_response
+
+        # Force an exception in parse_html
+        with patch.object(SciRateCrawler, 'parse_html', side_effect=Exception("Parsing failed")):
+            items = await crawler.fetch_latest()
+            assert items == []
+
+@pytest.mark.asyncio
+async def test_scirate_crawler_error():
+    crawler = SciRateCrawler()
+    with patch("aiohttp.ClientSession.get") as mock_get:
+        mock_response = AsyncMock()
+        mock_response.status = 500
+        mock_get.return_value.__aenter__.return_value = mock_response
+
+        items = await crawler.fetch_latest()
+        assert items == []