:Release Notes:
- Updated the Google Scholar crawler to automatically filter out results older than 5 years to ensure recent content.
:Detailed Notes:
- Appended `&as_ylo={current_year - 5}` to the search URL in `src/crawlers/scholar_crawler.py` by dynamically calculating the current year via Python's `datetime`.
- Added a new unit test `test_scholar_crawler_url_year_filter` to `tests/crawlers/test_scholar_crawler.py` to verify URL construction.
:Testing Performed:
- Ran the crawler test suite and verified that the expected year boundary is correctly formatted into the request URL.
- All 91 automated pytest cases complete successfully.
:QA Notes:
- Verified that the `as_ylo` parameter is inserted correctly, so Google Scholar applies the year filter server-side rather than relying on client-side filtering.
:Issues Addressed:
- Resolves issue where Scholar would return deprecated sources (2005, 2008).
Change-Id: I56ae2fd7369d61494d17520238c3ef66e14436c7
94 lines
4.3 KiB
Python
import logging
from datetime import datetime, timezone
from typing import List, Optional
from urllib.parse import quote_plus, urljoin

from playwright.async_api import async_playwright
from playwright_stealth import Stealth

from .base import ICrawler
from .dto import NewsItemDTO
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class ScholarCrawler(ICrawler):
    """Crawler that scrapes the first page of Google Scholar results for a query.

    The search URL is restricted to publications from the last
    ``YEAR_WINDOW`` years via Google Scholar's ``as_ylo`` parameter.
    """

    # How many years back from the current year results may be dated.
    YEAR_WINDOW = 5

    def __init__(self, query: str = "Artificial Intelligence", source: str = "Google Scholar"):
        """Build the Scholar search URL for *query*.

        Args:
            query: Search terms sent to Google Scholar.
            source: Human-readable source label attached to each result.
        """
        self.query = query
        current_year = datetime.now().year
        # quote_plus percent-encodes '&', '=', '#', '+', etc., so an
        # arbitrary query cannot break the URL or inject extra parameters
        # (the previous `query.replace(' ', '+')` handled only spaces).
        self.url = (
            "https://scholar.google.com/scholar"
            f"?hl=en&q={quote_plus(query)}"
            f"&as_ylo={current_year - self.YEAR_WINDOW}"
        )
        self.source = source

    async def fetch_latest(self) -> List[NewsItemDTO]:
        """Fetch the current first page of Scholar results.

        Returns:
            A list of NewsItemDTOs. Empty on CAPTCHA/bot detection or on
            any unexpected error -- crawling is deliberately best-effort.
        """
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    # A realistic desktop user agent reduces the chance
                    # of immediate bot blocking.
                    context = await browser.new_context(
                        user_agent=(
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/122.0.0.0 Safari/537.36"
                        )
                    )
                    page = await context.new_page()

                    # Apply stealth patches to avoid headless detection.
                    await Stealth().apply_stealth_async(page)

                    logger.info("Navigating to %s", self.url)
                    await page.goto(self.url, wait_until="networkidle", timeout=60000)

                    # Bail out early if Scholar served a CAPTCHA page.
                    content = await page.content()
                    if "CAPTCHA" in content or "not a robot" in content:
                        logger.warning("Google Scholar CAPTCHA or bot detection triggered")
                        return []

                    # Each ".gs_ri" element is one search result.
                    results = await page.query_selector_all(".gs_ri")
                    news_items = []
                    for res in results:
                        item = await self._parse_result(res)
                        if item is not None:
                            news_items.append(item)
                    return news_items
                finally:
                    await browser.close()
        except Exception:
            # Best-effort crawler: log the full traceback (logger.exception
            # preserves it; the old logger.error(f"...") dropped it) and
            # return no items instead of propagating.
            logger.exception("Error crawling Google Scholar")
            return []

    async def _parse_result(self, res) -> Optional[NewsItemDTO]:
        """Convert one ``.gs_ri`` result element into a DTO.

        Returns None when the result has no linked title (e.g. citation-only
        entries), which callers skip.
        """
        title_el = await res.query_selector(".gs_rt a")
        if not title_el:
            return None

        title = await title_el.inner_text()
        url = await title_el.get_attribute("href")

        # Snippet/abstract text, if present.
        snippet_el = await res.query_selector(".gs_rs")
        snippet = await snippet_el.inner_text() if snippet_el else ""

        # Metadata line: authors, journal, year.
        metadata_el = await res.query_selector(".gs_a")
        metadata = await metadata_el.inner_text() if metadata_el else ""

        # Citation count lives in a bottom link reading "Cited by N".
        citation_count = "0"
        for link in await res.query_selector_all(".gs_fl a"):
            text = await link.inner_text()
            if "Cited by" in text:
                citation_count = text.replace("Cited by", "").strip()
                break

        content_text = f"{metadata}\n\n{snippet}\n\nCitations: {citation_count}"
        return NewsItemDTO(
            title=title.strip(),
            url=url or self.url,
            content_text=content_text.strip(),
            source=f"{self.source}: {self.query}",
            timestamp=datetime.now(timezone.utc),
        )