AI-Trend-Scout/src/crawlers/scholar_crawler.py
Artur Mukhamadiev 6d2ac9d0f0 Feature: Filter out sources older than 5 years in Google Scholar Crawler
:Release Notes:
- Updated the Google Scholar crawler to automatically filter out results older than 5 years to ensure recent content.

:Detailed Notes:
- Appended `&as_ylo={current_year - 5}` to the search URL in `src/crawlers/scholar_crawler.py` by dynamically calculating the current year via Python's `datetime`.
- Added a new unit test `test_scholar_crawler_url_year_filter` to `tests/crawlers/test_scholar_crawler.py` to verify URL construction.

:Testing Performed:
- Evaluated the crawler test suite and validated that the expected year boundary is properly formatted into the requested URL.
- All 91 automated pytest cases complete successfully.

:QA Notes:
- Verified that the inserted `as_ylo` parameter causes Google Scholar to apply the year filter server-side, so outdated results are excluded before any client-side processing.

:Issues Addressed:
- Resolves the issue where the Scholar crawler would return outdated sources (e.g. publications from 2005 and 2008).

Change-Id: I56ae2fd7369d61494d17520238c3ef66e14436c7
2026-03-19 14:57:33 +03:00

94 lines
4.3 KiB
Python

import logging
from datetime import datetime, timezone
from typing import List, Optional
from urllib.parse import quote_plus, urljoin

from playwright.async_api import async_playwright
from playwright_stealth import Stealth

from .base import ICrawler
from .dto import NewsItemDTO
logger = logging.getLogger(__name__)
class ScholarCrawler(ICrawler):
    """Crawler that scrapes the first page of Google Scholar results for a query.

    The request URL carries ``as_ylo`` set to (current year - 5), so Google
    Scholar filters out publications older than 5 years server-side. Uses
    Playwright with stealth patches to reduce the chance of bot detection.
    """

    def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
        """
        Args:
            query: Search terms to submit to Google Scholar.
            source: Human-readable source label attached to emitted items.
        """
        self.query = query
        current_year = datetime.now().year
        # Percent-encode the query (quote_plus) so characters such as '&',
        # '=', '+' or non-ASCII text cannot corrupt the URL. quote_plus
        # encodes spaces as '+', matching the previous replace(' ', '+')
        # behavior for simple queries. `as_ylo` restricts results to
        # publications from the last 5 years.
        self.url = (
            "https://scholar.google.com/scholar?hl=en"
            f"&q={quote_plus(query)}&as_ylo={current_year - 5}"
        )
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch the first page of Scholar results as NewsItemDTO objects.

        Returns:
            A list of NewsItemDTO, or an empty list on any failure
            (navigation error, CAPTCHA / bot-detection page) — this crawler
            is best-effort and never raises to the caller.
        """
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    # A realistic desktop user agent lowers the odds of an
                    # immediate block.
                    context = await browser.new_context(
                        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
                    )
                    page = await context.new_page()
                    # Stealth patches hide common headless-automation markers.
                    await Stealth().apply_stealth_async(page)

                    logger.info(f"Navigating to {self.url}")
                    await page.goto(self.url, wait_until="networkidle", timeout=60000)

                    # Bail out early if Google served a CAPTCHA page instead
                    # of search results.
                    content = await page.content()
                    if "CAPTCHA" in content or "not a robot" in content:
                        logger.warning("Google Scholar CAPTCHA or bot detection triggered")
                        return []

                    # Each '.gs_ri' element is one search result.
                    results = await page.query_selector_all(".gs_ri")
                    news_items = []
                    for res in results:
                        item = await self._extract_item(res)
                        if item is not None:
                            news_items.append(item)
                    return news_items
                finally:
                    await browser.close()
        except Exception as e:
            logger.error(f"Error crawling Google Scholar: {e}")
            return []

    async def _extract_item(self, res) -> Optional[NewsItemDTO]:
        """Build a NewsItemDTO from a single '.gs_ri' result element.

        Returns:
            A populated NewsItemDTO, or None when the result has no title
            link (e.g. citation-only entries), which the caller skips.
        """
        title_el = await res.query_selector(".gs_rt a")
        if not title_el:
            return None
        title = await title_el.inner_text()
        url = await title_el.get_attribute("href")

        # Snippet / abstract text, if present.
        snippet_el = await res.query_selector(".gs_rs")
        snippet = await snippet_el.inner_text() if snippet_el else ""

        # Metadata line: authors, journal, year.
        metadata_el = await res.query_selector(".gs_a")
        metadata = await metadata_el.inner_text() if metadata_el else ""

        # Citation count lives in the bottom links as "Cited by N".
        citation_count = "0"
        for link in await res.query_selector_all(".gs_fl a"):
            text = await link.inner_text()
            if "Cited by" in text:
                citation_count = text.replace("Cited by", "").strip()
                break

        content_text = f"{metadata}\n\n{snippet}\n\nCitations: {citation_count}"
        return NewsItemDTO(
            title=title.strip(),
            url=url or self.url,
            content_text=content_text.strip(),
            source=f"{self.source}: {self.query}",
            timestamp=datetime.now(timezone.utc),
        )