import logging
from datetime import datetime, timezone
from typing import List, Optional
from urllib.parse import quote_plus, urljoin

from playwright.async_api import async_playwright
from playwright_stealth import Stealth

from .base import ICrawler
from .dto import NewsItemDTO

logger = logging.getLogger(__name__)


class ScholarCrawler(ICrawler):
    """Crawler that scrapes the first page of Google Scholar search results.

    Builds a Scholar query URL restricted to roughly the last five years and
    returns each result (title, link, snippet, author/venue metadata, and
    citation count) as a :class:`NewsItemDTO`.
    """

    def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
        """
        Args:
            query: Search terms submitted to Google Scholar.
            source: Human-readable source label stored on each DTO.
        """
        self.query = query
        current_year = datetime.now().year
        # quote_plus() encodes spaces as '+' AND percent-escapes other
        # reserved characters, so queries containing '&', '#', '%', etc. no
        # longer corrupt the URL (the previous `replace(' ', '+')` handled
        # only spaces). `as_ylo` limits results to the last five years.
        self.url = (
            "https://scholar.google.com/scholar"
            f"?hl=en&q={quote_plus(query)}&as_ylo={current_year - 5}"
        )
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch and parse the first page of Scholar results for the query.

        Returns:
            A list of NewsItemDTO objects. Returns an empty list when Scholar
            serves a CAPTCHA / bot-detection page or when any error occurs
            (errors are logged, never raised).
        """
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    # A realistic desktop user agent reduces the chance of
                    # being flagged as a headless bot.
                    context = await browser.new_context(
                        user_agent=(
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/122.0.0.0 Safari/537.36"
                        )
                    )
                    page = await context.new_page()
                    # Patch common headless-browser fingerprints before navigating.
                    await Stealth().apply_stealth_async(page)

                    logger.info("Navigating to %s", self.url)
                    await page.goto(self.url, wait_until="networkidle", timeout=60000)

                    # Bail out early if Scholar served a challenge page
                    # instead of results.
                    content = await page.content()
                    if "CAPTCHA" in content or "not a robot" in content:
                        logger.warning("Google Scholar CAPTCHA or bot detection triggered")
                        return []

                    # Each `.gs_ri` element wraps one search result.
                    news_items: List[NewsItemDTO] = []
                    for res in await page.query_selector_all(".gs_ri"):
                        item = await self._parse_result(res)
                        if item is not None:
                            news_items.append(item)
                    return news_items
                finally:
                    await browser.close()
        except Exception:
            # Best-effort crawler: log (with traceback) and return empty
            # rather than propagating to the caller.
            logger.exception("Error crawling Google Scholar")
            return []

    async def _parse_result(self, res) -> Optional[NewsItemDTO]:
        """Convert one `.gs_ri` result element into a NewsItemDTO.

        Returns None for results without a linked title (e.g. citation-only
        entries, which Scholar renders without an anchor).
        """
        title_el = await res.query_selector(".gs_rt a")
        if not title_el:
            return None
        title = await title_el.inner_text()
        url = await title_el.get_attribute("href")

        # Snippet / abstract preview.
        snippet_el = await res.query_selector(".gs_rs")
        snippet = await snippet_el.inner_text() if snippet_el else ""

        # `.gs_a` holds the "authors - journal, year - publisher" line.
        metadata_el = await res.query_selector(".gs_a")
        metadata = await metadata_el.inner_text() if metadata_el else ""

        # Citation count appears in the bottom link row as "Cited by N".
        citation_count = "0"
        for link in await res.query_selector_all(".gs_fl a"):
            text = await link.inner_text()
            if "Cited by" in text:
                citation_count = text.replace("Cited by", "").strip()
                break

        content_text = f"{metadata}\n\n{snippet}\n\nCitations: {citation_count}"
        return NewsItemDTO(
            title=title.strip(),
            url=url or self.url,
            content_text=content_text.strip(),
            source=f"{self.source}: {self.query}",
            timestamp=datetime.now(timezone.utc),
        )