diff --git a/src/crawlers/scholar_crawler.py b/src/crawlers/scholar_crawler.py
index a3104cc..8c419a4 100644
--- a/src/crawlers/scholar_crawler.py
+++ b/src/crawlers/scholar_crawler.py
@@ -13,8 +13,9 @@ logger = logging.getLogger(__name__)
 class ScholarCrawler(ICrawler):
     def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
         self.query = query
+        current_year = datetime.now().year
         # Google Scholar query URL
-        self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}"
+        self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}&as_ylo={current_year - 5}"
         self.source = source
 
     async def fetch_latest(self) -> List[NewsItemDTO]:
diff --git a/tests/crawlers/test_scholar_crawler.py b/tests/crawlers/test_scholar_crawler.py
index f4a3e98..04fbf4f 100644
--- a/tests/crawlers/test_scholar_crawler.py
+++ b/tests/crawlers/test_scholar_crawler.py
@@ -113,3 +113,14 @@ async def test_scholar_crawler_captcha():
     items = await crawler.fetch_latest()
 
     assert items == []
+
+@pytest.mark.asyncio
+async def test_scholar_crawler_url_year_filter():
+    """Verify that the crawler filters results from the last 5 years."""
+    current_year = datetime.now().year
+    expected_year = current_year - 5
+    query = "Edge AI"
+    crawler = ScholarCrawler(query=query)
+
+    # The URL should include the lower year bound filter
+    assert f"&as_ylo={expected_year}" in crawler.url