Feature: Filter out sources older than 5 years in Google Scholar Crawler
:Release Notes:
- Updated the Google Scholar crawler to automatically filter out results older than 5 years to ensure recent content.
:Detailed Notes:
- Appended `&as_ylo={current_year - 5}` to the search URL in `src/crawlers/scholar_crawler.py` by dynamically calculating the current year via Python's `datetime`.
- Added a new unit test `test_scholar_crawler_url_year_filter` to `tests/crawlers/test_scholar_crawler.py` to verify URL construction.
:Testing Performed:
- Ran the crawler test suite and validated that the expected year boundary is correctly formatted into the request URL.
- All 91 automated pytest cases complete successfully.
:QA Notes:
- Verified that inserting the `as_ylo` parameter makes Google Scholar limit results by year at the search-engine level.
:Issues Addressed:
- Resolves an issue where Scholar would return outdated sources (e.g., from 2005 and 2008).
Change-Id: I56ae2fd7369d61494d17520238c3ef66e14436c7
This commit is contained in:
parent
e1c7f47f8f
commit
6d2ac9d0f0
@ -13,8 +13,9 @@ logger = logging.getLogger(__name__)
|
|||||||
class ScholarCrawler(ICrawler):
|
class ScholarCrawler(ICrawler):
|
||||||
def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
|
def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
|
||||||
self.query = query
|
self.query = query
|
||||||
|
current_year = datetime.now().year
|
||||||
# Google Scholar query URL
|
# Google Scholar query URL
|
||||||
self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}"
|
self.url = f"https://scholar.google.com/scholar?hl=en&q={query.replace(' ', '+')}&as_ylo={current_year - 5}"
|
||||||
self.source = source
|
self.source = source
|
||||||
|
|
||||||
async def fetch_latest(self) -> List[NewsItemDTO]:
|
async def fetch_latest(self) -> List[NewsItemDTO]:
|
||||||
|
|||||||
@ -113,3 +113,14 @@ async def test_scholar_crawler_captcha():
|
|||||||
|
|
||||||
items = await crawler.fetch_latest()
|
items = await crawler.fetch_latest()
|
||||||
assert items == []
|
assert items == []
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_scholar_crawler_url_year_filter():
|
||||||
|
"""Verify that the crawler filters results from the last 5 years."""
|
||||||
|
current_year = datetime.now().year
|
||||||
|
expected_year = current_year - 5
|
||||||
|
query = "Edge AI"
|
||||||
|
crawler = ScholarCrawler(query=query)
|
||||||
|
|
||||||
|
# The URL should include the lower year bound filter
|
||||||
|
assert f"&as_ylo={expected_year}" in crawler.url
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user