:Release Notes:
- Updated the Google Scholar crawler to automatically filter out results older than 5 years to ensure recent content.
:Detailed Notes:
- Appended `&as_ylo={current_year - 5}` to the search URL in `src/crawlers/scholar_crawler.py` by dynamically calculating the current year via Python's `datetime`.
- Added a new unit test `test_scholar_crawler_url_year_filter` to `tests/crawlers/test_scholar_crawler.py` to verify URL construction.
:Testing Performed:
- Ran the crawler test suite and verified that the expected year boundary is correctly formatted into the request URL.
- All 91 automated pytest cases complete successfully.
:QA Notes:
- Verified that the `as_ylo` parameter is inserted correctly, so Google Scholar applies the year filter server-side rather than relying on client-side filtering.
:Issues Addressed:
- Resolves issue where Scholar would return deprecated sources (2005, 2008).
Change-Id: I56ae2fd7369d61494d17520238c3ef66e14436c7
94 lines
4.3 KiB
Python
import logging
from datetime import datetime, timezone
from typing import List, Optional
from urllib.parse import quote_plus, urljoin

from playwright.async_api import async_playwright
from playwright_stealth import Stealth

from .base import ICrawler
from .dto import NewsItemDTO
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class ScholarCrawler(ICrawler):
    """Crawler that scrapes the first page of Google Scholar results for a query.

    The search URL is restricted to publications from the last
    ``YEAR_WINDOW`` years via Google Scholar's ``as_ylo`` parameter.
    """

    # How many years back from the current year results may be dated.
    YEAR_WINDOW = 5

    def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
        """Build the Scholar search URL for *query*.

        Args:
            query: Search terms sent to Google Scholar.
            source: Human-readable source label attached to each result.
        """
        self.query = query
        current_year = datetime.now().year
        # quote_plus percent-encodes '&', '=', '#', '+', etc., so an
        # arbitrary query cannot break the URL or inject extra parameters
        # (the previous `query.replace(' ', '+')` handled only spaces).
        self.url = (
            "https://scholar.google.com/scholar"
            f"?hl=en&q={quote_plus(query)}"
            f"&as_ylo={current_year - self.YEAR_WINDOW}"
        )
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch the current first page of Scholar results.

        Returns:
            A list of NewsItemDTOs. Empty on CAPTCHA/bot detection or on
            any unexpected error -- crawling is deliberately best-effort.
        """
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    # A realistic desktop user agent reduces the chance
                    # of immediate bot blocking.
                    context = await browser.new_context(
                        user_agent=(
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/122.0.0.0 Safari/537.36"
                        )
                    )
                    page = await context.new_page()

                    # Apply stealth patches to avoid headless detection.
                    await Stealth().apply_stealth_async(page)

                    logger.info("Navigating to %s", self.url)
                    await page.goto(self.url, wait_until="networkidle", timeout=60000)

                    # Bail out early if Scholar served a CAPTCHA page.
                    content = await page.content()
                    if "CAPTCHA" in content or "not a robot" in content:
                        logger.warning("Google Scholar CAPTCHA or bot detection triggered")
                        return []

                    # Each ".gs_ri" element is one search result.
                    results = await page.query_selector_all(".gs_ri")
                    news_items = []
                    for res in results:
                        item = await self._parse_result(res)
                        if item is not None:
                            news_items.append(item)
                    return news_items
                finally:
                    await browser.close()
        except Exception:
            # Best-effort crawler: log the full traceback (logger.exception
            # preserves it; the old logger.error(f"...") dropped it) and
            # return no items instead of propagating.
            logger.exception("Error crawling Google Scholar")
            return []

    async def _parse_result(self, res) -> Optional[NewsItemDTO]:
        """Convert one ``.gs_ri`` result element into a DTO.

        Returns None when the result has no linked title (e.g. citation-only
        entries), which callers skip.
        """
        title_el = await res.query_selector(".gs_rt a")
        if not title_el:
            return None

        title = await title_el.inner_text()
        url = await title_el.get_attribute("href")

        # Snippet/abstract text, if present.
        snippet_el = await res.query_selector(".gs_rs")
        snippet = await snippet_el.inner_text() if snippet_el else ""

        # Metadata line: authors, journal, year.
        metadata_el = await res.query_selector(".gs_a")
        metadata = await metadata_el.inner_text() if metadata_el else ""

        # Citation count lives in a bottom link reading "Cited by N".
        citation_count = "0"
        for link in await res.query_selector_all(".gs_fl a"):
            text = await link.inner_text()
            if "Cited by" in text:
                citation_count = text.replace("Cited by", "").strip()
                break

        content_text = f"{metadata}\n\n{snippet}\n\nCitations: {citation_count}"
        return NewsItemDTO(
            title=title.strip(),
            url=url or self.url,
            content_text=content_text.strip(),
            source=f"{self.source}: {self.query}",
            timestamp=datetime.now(timezone.utc),
        )