From 6d2ac9d0f068d91fb67f452b95d4ea761519212c Mon Sep 17 00:00:00 2001 From: Artur Mukhamadiev Date: Thu, 19 Mar 2026 14:57:33 +0300 Subject: [PATCH] Feature: Filter out sources older than 5 years in Google Scholar Crawler :Release Notes: - Updated the Google Scholar crawler to automatically filter out results older than 5 years so that only recent content is returned. :Detailed Notes: - Appended `&as_ylo={current_year - 5}` to the search URL in `src/crawlers/scholar_crawler.py`, computing the current year dynamically via Python's `datetime`. - Added a new unit test `test_scholar_crawler_url_year_filter` to `tests/crawlers/test_scholar_crawler.py` to verify URL construction. :Testing Performed: - Ran the crawler test suite and confirmed that the expected lower year bound is correctly formatted into the requested URL. - All 91 automated pytest cases pass. :QA Notes: - Verified that the inserted parameter makes Google limit results correctly at the search-engine level. :Issues Addressed: - Resolves the issue where Scholar would return outdated sources (e.g. from 2005 and 2008). 
Change-Id: I56ae2fd7369d61494d17520238c3ef66e14436c7 --- src/crawlers/scholar_crawler.py | 3 ++- tests/crawlers/test_scholar_crawler.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/crawlers/scholar_crawler.py b/src/crawlers/scholar_crawler.py index a3104cc..8c419a4 100644 --- a/src/crawlers/scholar_crawler.py +++ b/src/crawlers/scholar_crawler.py @@ -13,8 +13,9 @@ logger = logging.getLogger(__name__) class ScholarCrawler(ICrawler): def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"): self.query = query + current_year = datetime.now().year # Google Scholar query URL - self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}" + self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}&as_ylo={current_year - 5}" self.source = source async def fetch_latest(self) -> List[NewsItemDTO]: diff --git a/tests/crawlers/test_scholar_crawler.py b/tests/crawlers/test_scholar_crawler.py index f4a3e98..04fbf4f 100644 --- a/tests/crawlers/test_scholar_crawler.py +++ b/tests/crawlers/test_scholar_crawler.py @@ -113,3 +113,14 @@ async def test_scholar_crawler_captcha(): items = await crawler.fetch_latest() assert items == [] + +@pytest.mark.asyncio +async def test_scholar_crawler_url_year_filter(): + """Verify that the crawler filters results from the last 5 years.""" + current_year = datetime.now().year + expected_year = current_year - 5 + query = "Edge AI" + crawler = ScholarCrawler(query=query) + + # The URL should include the lower year bound filter + assert f"&as_ylo={expected_year}" in crawler.url