AI-Trend-Scout/tests/crawlers/test_scholar_crawler.py
Artur Mukhamadiev 6d2ac9d0f0 Feature: Filter out sources older than 5 years in Google Scholar Crawler
:Release Notes:
- Updated the Google Scholar crawler to automatically filter out results older than 5 years to ensure recent content.

:Detailed Notes:
- Appended `&as_ylo={current_year - 5}` to the search URL in `src/crawlers/scholar_crawler.py` by dynamically calculating the current year via Python's `datetime`.
- Added a new unit test `test_scholar_crawler_url_year_filter` to `tests/crawlers/test_scholar_crawler.py` to verify URL construction.

:Testing Performed:
- Ran the crawler test suite and verified that the expected year boundary is correctly formatted into the requested URL.
- All 91 automated pytest cases complete successfully.

:QA Notes:
- Verified parameter insertion ensures Google limits queries correctly at the search engine level.

:Issues Addressed:
- Resolves the issue where Scholar would return outdated sources (e.g., from 2005 and 2008).

Change-Id: I56ae2fd7369d61494d17520238c3ef66e14436c7
2026-03-19 14:57:33 +03:00

127 lines
4.8 KiB
Python

import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from datetime import datetime, timezone
from src.crawlers.scholar_crawler import ScholarCrawler
from src.crawlers.dto import NewsItemDTO
@pytest.mark.asyncio
async def test_scholar_crawler_fetch_latest():
    """Happy path: a single mocked Scholar result is parsed into one item."""
    crawler = ScholarCrawler(query="Large Language Models", source="Google Scholar")
    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_mock, \
            patch("src.crawlers.scholar_crawler.Stealth") as stealth_cls:
        # Stealth wrapper: async apply must be awaitable.
        stealth = MagicMock()
        stealth.apply_stealth_async = AsyncMock()
        stealth_cls.return_value = stealth

        # Playwright chain: playwright -> browser -> context -> page.
        pw = AsyncMock()
        playwright_mock.return_value.__aenter__.return_value = pw
        browser = AsyncMock()
        pw.chromium.launch.return_value = browser
        context = AsyncMock()
        browser.new_context.return_value = context
        page = AsyncMock()
        context.new_page.return_value = page
        # Plain HTML so the crawler's CAPTCHA check does not trigger.
        page.content.return_value = "<html><body>Results</body></html>"

        # One search-result node with title/snippet/author sub-elements.
        result = AsyncMock()
        title_el = AsyncMock()
        title_el.inner_text.return_value = "LLM Paper Title"
        title_el.get_attribute.return_value = "https://arxiv.org/abs/2401.00001"
        selectors = {
            ".gs_rt a": title_el,
            ".gs_rs": AsyncMock(inner_text=AsyncMock(return_value="This is a snippet")),
            ".gs_a": AsyncMock(inner_text=AsyncMock(return_value="Authors et al.")),
        }
        result.query_selector.side_effect = selectors.get

        # Citation link ("Cited by N").
        citation = AsyncMock()
        citation.inner_text.return_value = "Cited by 123"
        result.query_selector_all.return_value = [citation]
        page.query_selector_all.return_value = [result]

        items = await crawler.fetch_latest()

        assert len(items) == 1
        assert items[0].title == "LLM Paper Title"
@pytest.mark.asyncio
async def test_scholar_crawler_no_title():
    """A result node with no title link must be skipped entirely."""
    crawler = ScholarCrawler()
    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_mock:
        # Playwright chain: playwright -> browser -> context -> page.
        pw = AsyncMock()
        playwright_mock.return_value.__aenter__.return_value = pw
        browser = AsyncMock()
        pw.chromium.launch.return_value = browser
        context = AsyncMock()
        browser.new_context.return_value = context
        page = AsyncMock()
        context.new_page.return_value = page
        # Plain HTML so the CAPTCHA check does not trigger.
        page.content.return_value = "<html><body>Results</body></html>"

        # Every selector lookup on the result misses (no title element).
        titleless = AsyncMock()
        titleless.query_selector.return_value = None
        page.query_selector_all.return_value = [titleless]

        items = await crawler.fetch_latest()

        assert len(items) == 0
@pytest.mark.asyncio
async def test_scholar_crawler_exception():
    """Any browser-level failure must be swallowed and yield an empty list."""
    crawler = ScholarCrawler()
    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_mock:
        pw = AsyncMock()
        playwright_mock.return_value.__aenter__.return_value = pw
        browser = AsyncMock()
        pw.chromium.launch.return_value = browser
        # Context creation blows up before any page is opened.
        browser.new_context.side_effect = Exception("Browser error")

        items = await crawler.fetch_latest()

        assert items == []
@pytest.mark.asyncio
async def test_scholar_crawler_captcha():
    """A CAPTCHA interstitial must abort the crawl and return no items."""
    crawler = ScholarCrawler()
    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_mock:
        # Playwright chain: playwright -> browser -> context -> page.
        pw = AsyncMock()
        playwright_mock.return_value.__aenter__.return_value = pw
        browser = AsyncMock()
        pw.chromium.launch.return_value = browser
        context = AsyncMock()
        browser.new_context.return_value = context
        page = AsyncMock()
        context.new_page.return_value = page
        # Page content containing the CAPTCHA marker text.
        page.content.return_value = "<html><body>Please solve this CAPTCHA</body></html>"

        items = await crawler.fetch_latest()

        assert items == []
@pytest.mark.asyncio
async def test_scholar_crawler_url_year_filter():
    """The built search URL must carry an `as_ylo` bound of (current year - 5)."""
    lower_bound = datetime.now().year - 5
    crawler = ScholarCrawler(query="Edge AI")
    # The crawler is expected to append the lower-year filter to its URL.
    assert f"&as_ylo={lower_bound}" in crawler.url