:Release Notes:
- Updated the Google Scholar crawler to automatically filter out results older than 5 years to ensure recent content.
:Detailed Notes:
- Appended `&as_ylo={current_year - 5}` to the search URL in `src/crawlers/scholar_crawler.py` by dynamically calculating the current year via Python's `datetime`.
- Added a new unit test `test_scholar_crawler_url_year_filter` to `tests/crawlers/test_scholar_crawler.py` to verify URL construction.
:Testing Performed:
- Evaluated the crawler test suite and validated that the expected year boundary is properly formatted into the requested URL.
- All 91 automated pytest cases complete successfully.
:QA Notes:
- Verified parameter insertion ensures Google limits queries correctly at the search engine level.
:Issues Addressed:
- Resolves issue where Scholar would return outdated sources (e.g. papers from 2005, 2008).
Change-Id: I56ae2fd7369d61494d17520238c3ef66e14436c7
127 lines
4.8 KiB
Python
import pytest
|
|
from unittest.mock import AsyncMock, patch, MagicMock
|
|
from datetime import datetime, timezone
|
|
from src.crawlers.scholar_crawler import ScholarCrawler
|
|
from src.crawlers.dto import NewsItemDTO
|
|
|
|
@pytest.mark.asyncio
async def test_scholar_crawler_fetch_latest():
    """Happy path: one mocked Scholar result is parsed into one item."""
    crawler = ScholarCrawler(query="Large Language Models", source="Google Scholar")

    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_patch, \
            patch("src.crawlers.scholar_crawler.Stealth") as stealth_cls:
        # Stealth instance whose async hook can be awaited harmlessly.
        stealth_instance = MagicMock()
        stealth_instance.apply_stealth_async = AsyncMock()
        stealth_cls.return_value = stealth_instance

        # Wire the playwright -> browser -> context -> page mock chain.
        pw = AsyncMock()
        playwright_patch.return_value.__aenter__.return_value = pw
        browser = AsyncMock()
        pw.chromium.launch.return_value = browser
        context = AsyncMock()
        browser.new_context.return_value = context
        page = AsyncMock()
        context.new_page.return_value = page

        # Benign HTML so the crawler's CAPTCHA detection does not trigger.
        page.content.return_value = "<html><body>Results</body></html>"

        # One search result carrying title, snippet, authors and a citation count.
        result = AsyncMock()

        title_el = AsyncMock()
        title_el.inner_text.return_value = "LLM Paper Title"
        title_el.get_attribute.return_value = "https://arxiv.org/abs/2401.00001"

        selector_map = {
            ".gs_rt a": title_el,
            ".gs_rs": AsyncMock(inner_text=AsyncMock(return_value="This is a snippet")),
            ".gs_a": AsyncMock(inner_text=AsyncMock(return_value="Authors et al.")),
        }
        result.query_selector.side_effect = selector_map.get

        citation_link = AsyncMock()
        citation_link.inner_text.return_value = "Cited by 123"
        result.query_selector_all.return_value = [citation_link]

        page.query_selector_all.return_value = [result]

        items = await crawler.fetch_latest()

        assert len(items) == 1
        assert items[0].title == "LLM Paper Title"
|
@pytest.mark.asyncio
async def test_scholar_crawler_no_title():
    """A result whose title selector misses is skipped, yielding no items."""
    crawler = ScholarCrawler()

    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_patch:
        # Wire the playwright -> browser -> context -> page mock chain.
        pw = AsyncMock()
        playwright_patch.return_value.__aenter__.return_value = pw
        browser = AsyncMock()
        pw.chromium.launch.return_value = browser
        context = AsyncMock()
        browser.new_context.return_value = context
        page = AsyncMock()
        context.new_page.return_value = page
        page.content.return_value = "<html><body>Results</body></html>"

        # Result item where every selector lookup (including the title) misses.
        titleless = AsyncMock()
        titleless.query_selector.return_value = None
        page.query_selector_all.return_value = [titleless]

        items = await crawler.fetch_latest()

        assert len(items) == 0
|
@pytest.mark.asyncio
async def test_scholar_crawler_exception():
    """Any browser-level failure is swallowed and reported as an empty list."""
    crawler = ScholarCrawler()

    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_patch:
        pw = AsyncMock()
        playwright_patch.return_value.__aenter__.return_value = pw
        browser = AsyncMock()
        pw.chromium.launch.return_value = browser

        # Blow up as soon as the crawler tries to open a browser context.
        browser.new_context.side_effect = Exception("Browser error")

        items = await crawler.fetch_latest()

        assert items == []
|
@pytest.mark.asyncio
async def test_scholar_crawler_captcha():
    """A CAPTCHA challenge page aborts the crawl with an empty result list."""
    crawler = ScholarCrawler()

    with patch("src.crawlers.scholar_crawler.async_playwright") as playwright_patch:
        # Wire the playwright -> browser -> context -> page mock chain.
        pw = AsyncMock()
        playwright_patch.return_value.__aenter__.return_value = pw
        browser = AsyncMock()
        pw.chromium.launch.return_value = browser
        context = AsyncMock()
        browser.new_context.return_value = context
        page = AsyncMock()
        context.new_page.return_value = page

        # Serve a CAPTCHA page so the crawler's detection path bails out.
        page.content.return_value = "<html><body>Please solve this CAPTCHA</body></html>"

        items = await crawler.fetch_latest()

        assert items == []
|
@pytest.mark.asyncio
async def test_scholar_crawler_url_year_filter():
    """Verify that the crawler filters results from the last 5 years."""
    # The crawler computes the bound from the current local year, so mirror
    # that here rather than hard-coding a year that would rot annually.
    lower_bound = datetime.now().year - 5

    crawler = ScholarCrawler(query="Edge AI")

    # The constructed URL must carry the lower-year-bound query parameter.
    assert f"&as_ylo={lower_bound}" in crawler.url