From 87af585e1b23825c65af817f3cf83b29aacbe91a Mon Sep 17 00:00:00 2001
From: Artur Mukhamadiev <muhamadiev1@gmail.com>
Date: Sun, 15 Mar 2026 00:45:04 +0300
Subject: [PATCH] Refactor crawlers configuration and add new sources

- Move hard-coded crawlers from main.py to crawlers.yml
- Use CrawlerFactory to load configuration
- Add 9 new sources: C++ Russia, ICRA 2025, Technoprom, INNOPROM, Hannover Messe, RSF, Skolkovo, Horizon Europe, Addmeto
- Update task list
---
 .gitignore                                    |   4 +
 .../tasks/crawler-refactoring-tasklist.md     |  85 +++++++++++++
 src/crawlers.yml                              |  87 +++++++++++++
 src/crawlers/factory.py                       |  45 +++++++
 src/main.py                                   |  54 +++-----
 src/processor/ollama_provider.py              |  52 +++++---
 tests/crawlers/test_factory.py                | 120 ++++++++++++++++++
 7 files changed, 396 insertions(+), 51 deletions(-)
 create mode 100644 ai/memory-bank/tasks/crawler-refactoring-tasklist.md
 create mode 100644 src/crawlers.yml
 create mode 100644 src/crawlers/factory.py
 create mode 100644 tests/crawlers/test_factory.py

diff --git a/.gitignore b/.gitignore
index e15106e..4a473e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -214,3 +214,7 @@ __marimo__/
 
 # Streamlit
 .streamlit/secrets.toml
+
+chroma_db/
+hidden_docs/
+.opencode
\ No newline at end of file
diff --git a/ai/memory-bank/tasks/crawler-refactoring-tasklist.md b/ai/memory-bank/tasks/crawler-refactoring-tasklist.md
new file mode 100644
index 0000000..1e13e28
--- /dev/null
+++ b/ai/memory-bank/tasks/crawler-refactoring-tasklist.md
@@ -0,0 +1,85 @@
+# Crawler Refactoring & Source Expansion Development Tasks
+
+## Specification Summary
+**Original Requirements**: Move hard-coded crawlers from `src/main.py` to `src/crawlers.yml`. Add new sources from the provided table (IT Conferences, Scientific Forums, Exhibitions, Grants, Journals, Startups, Blogs).
+**Technical Stack**: Python, aiogram, ChromaDB, Playwright, RSS, YAML.
+**Target Timeline**: Immediate refactoring and expansion.
+
+## Development Tasks
+
+### [x] Task 1: Clean up `src/main.py`
+**Description**: Refactor `src/main.py` to load crawlers from `src/crawlers.yml` using `CrawlerFactory.load_from_yaml()`.
+**Acceptance Criteria**: 
+- `src/main.py` no longer contains hard-coded crawler instances.
+- Bot starts and correctly loads crawlers from the YAML file.
+- Logging confirms the number of loaded crawlers.
+
+**Files to Edit**:
+- `src/main.py`
+
+### [x] Task 2: Verify and Update `src/crawlers.yml` for Existing Sources
+**Description**: Ensure all crawlers previously hard-coded in `src/main.py` are present in `src/crawlers.yml`.
+**Acceptance Criteria**:
+- All 16 original sources from `main.py` are correctly configured in `crawlers.yml`.
+- Selectors for Playwright crawlers (CVPR, CES) are verified.
+
+**Files to Edit**:
+- `src/crawlers.yml`
+
+### [x] Task 3: Add New IT Conference Sources
+**Description**: Add C++ Russia and ICRA 2025 to `crawlers.yml`.
+**Acceptance Criteria**:
+- C++ Russia (`https://cppconf.ru/`) added (suggested crawler type: Playwright).
+- ICRA 2025 (`https://www.icra2025.org/`) added (suggested crawler type: Playwright).
+- Correct selectors identified for both.
+
+**Reference**: Table Category "IT Conferences"
+
+### [x] Task 4: Add Scientific Forums and Exhibitions
+**Description**: Add Technoprom-2025, INNOPROM-2025, and Hannover Messe.
+**Acceptance Criteria**:
+- Technoprom-2025 (`https://форумтехнопром.рф/`) added.
+- INNOPROM-2025 (`https://innoprom.com/en/`) added.
+- Hannover Messe (`https://www.hannovermesse.de/en/`) added.
+- All use appropriate Playwright selectors.
+
+**Reference**: Table Categories "Scientific Forums", "Exhibitions"
+
+### [x] Task 5: Add Grants and Funds Sources
+**Description**: Add RSF, Skolkovo, and Horizon Europe.
+**Acceptance Criteria**:
+- RSF (`https://rscf.ru/en/news/`) added.
+- Skolkovo (`https://sk.ru/news/`) added.
+- Horizon Europe (`https://research-and-innovation.ec.europa.eu/news_en`) added.
+- Check whether an RSS feed is available for each of these; otherwise use Playwright.
+
+**Reference**: Table Category "Grants and Funds"
+
+### [x] Task 6: Add Telegram: Addmeto Source
+**Description**: Add the Addmeto Telegram channel to the crawlers.
+**Acceptance Criteria**:
+- Source `https://t.me/s/addmeto` added.
+- Use Playwright with selector `.tgme_widget_message_text` to extract content.
+
+**Reference**: Table Category "Blogs and Channels"
+
+### [x] Task 7: Quality Assurance and Integration Testing
+**Description**: Verify that the new crawlers work and don't break the system.
+**Acceptance Criteria**:
+- Run existing tests: `pytest tests/`
+- Run a trial iteration with a limited number of crawlers (can be done via a temporary test script).
+- Verify that storage (ChromaDB) correctly handles new sources.
+
+
+**QA Tool**: `./qa-playwright-capture.sh http://localhost:8000 public/qa-screenshots` (only if applicable — this project is a bot, so checking the logs may be the more relevant verification).
+
+## Quality Requirements
+- [ ] All crawlers return standardized DTOs.
+- [ ] No hard-coded credentials in YAML.
+- [ ] Proper error handling for failed crawlers (already in Orchestrator).
+- [ ] Summarization works for new sources in Russian.
+
+## Technical Notes
+**Crawler Types**: 
+- Use `rss` for Nature, Science, UFN, VC.ru, RB.ru, TAdviser, Google Blogs, Yandex Tech, Habr, Samsung Newsroom, Sony Newsroom.
+- Use `playwright` for Conferences, Exhibitions, and Telegram.
diff --git a/src/crawlers.yml b/src/crawlers.yml
new file mode 100644
index 0000000..ccaf1d4
--- /dev/null
+++ b/src/crawlers.yml
@@ -0,0 +1,87 @@
+crawlers:
+  - type: rss
+    url: "https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru"
+    source: "Habr AI"
+  - type: rss
+    url: "https://www.nature.com/nature.rss"
+    source: "Nature"
+  - type: rss
+    url: "https://news.samsung.com/global/rss"
+    source: "Samsung Newsroom"
+  - type: rss
+    url: "https://www.sony.com/en/SonyInfo/News/Service/rss.xml"
+    source: "Sony Newsroom"
+  - type: playwright
+    url: "https://cvpr.thecvf.com/Conferences/2025"
+    source: "CVPR 2025"
+    selector: ".conference-news-item"
+  - type: playwright
+    url: "https://www.ces.tech/discover/?type=Article%2CSuccess+Story%2CPodcast&sort=desc&topics=Artificial+Intelligence%2CContent+and+Entertainment%2CAccessibility%2CInnovation+For+All"
+    source: "CES 2025"
+    selector: ".press-release-item"
+  - type: rss
+    url: "https://vc.ru/rss/tag/tech"
+    source: "VC.ru Tech"
+  - type: rss
+    url: "https://vc.ru/rss/tag/iot"
+    source: "vc.ru IoT"
+  - type: rss
+    url: "https://rb.ru/feeds/tag/iot"
+    source: "RB.ru IoT"
+  - type: rss
+    url: "https://www.science.org/rss/news_current.xml"
+    source: "Science News"
+  - type: rss
+    url: "https://ufn.ru/en/articles/rss.xml?pacs=03,84"
+    source: "УФН; PACS: 03,84"
+  - type: rss
+    url: "https://www.tadviser.ru/xml/tadviser.xml"
+    source: "TAdviser"
+  - type: rss
+    url: "https://blog.google/innovation-and-ai/technology/ai/rss/"
+    source: "Google AI Blog"
+  - type: rss
+    url: "https://habr.com/ru/rss/company/yandex/blog/"
+    source: "Yandex Tech"
+  - type: rss
+    url: "https://blog.google/products-and-platforms/products/chrome/rss/"
+    source: "Google Chrome Blog"
+  - type: rss
+    url: "https://blog.google/products-and-platforms/platforms/android/rss/"
+    source: "Google Android Blog"
+  - type: playwright
+    url: "https://cppconf.ru/"
+    source: "C++ Russia"
+    selector: ".talks h3 a"
+  - type: playwright
+    url: "https://www.icra2025.org/"
+    source: "ICRA 2025"
+    selector: "h2 a"
+  - type: playwright
+    url: "https://форумтехнопром.рф/"
+    source: "Technoprom-2025"
+    selector: ".news-card a"
+  - type: playwright
+    url: "https://innoprom.com/en/news/"
+    source: "INNOPROM-2025"
+    selector: ".news-item a"
+  - type: playwright
+    url: "https://www.hannovermesse.de/en/press/press-releases/hannover-messe/"
+    source: "Hannover Messe"
+    selector: ".media-item a"
+  - type: playwright
+    url: "https://rscf.ru/en/news/"
+    source: "RSF"
+    selector: ".news-item a"
+  - type: playwright
+    url: "https://sk.ru/news/"
+    source: "Skolkovo"
+    selector: ".news-item a"
+  - type: playwright
+    url: "https://research-and-innovation.ec.europa.eu/news_en"
+    source: "Horizon Europe"
+    selector: ".ecl-news-item a"
+  - type: playwright
+    url: "https://t.me/s/addmeto"
+    source: "Addmeto"
+    selector: ".tgme_widget_message_text"
diff --git a/src/crawlers/factory.py b/src/crawlers/factory.py
new file mode 100644
index 0000000..4aa6a91
--- /dev/null
+++ b/src/crawlers/factory.py
@@ -0,0 +1,45 @@
+import yaml
+import logging
+from typing import List
+from src.crawlers.base import ICrawler
+from src.crawlers.rss_crawler import RSSCrawler
+from src.crawlers.playwright_crawler import PlaywrightCrawler
+
+logger = logging.getLogger(__name__)
+
+class CrawlerFactory:
+    @staticmethod
+    def load_from_yaml(file_path: str) -> List[ICrawler]:
+        """Build the crawlers listed under ``crawlers`` in a YAML file.
+
+        Bad entries are skipped with a warning; errors are logged, giving [].
+        """
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                config = yaml.safe_load(f)
+            if not config or not isinstance(config, dict):
+                logger.warning(f"Invalid or empty configuration in {file_path}")
+                return []
+            crawlers = []
+            # A bare `crawlers:` key parses as None; treat it as an empty list.
+            for item in config.get('crawlers') or []:
+                if not isinstance(item, dict):
+                    logger.warning(f"Skipping non-mapping crawler entry: {item!r}")
+                    continue
+                crawler_type = item.get('type')
+                url = item.get('url')
+                source = item.get('source')
+                if not url or not source:
+                    logger.warning(f"Missing mandatory fields (url, source) for crawler: {item}")
+                    continue
+                if crawler_type == 'rss':
+                    crawlers.append(RSSCrawler(url=url, source=source))
+                elif crawler_type == 'playwright':
+                    selector = item.get('selector')
+                    crawlers.append(PlaywrightCrawler(url=url, source=source, selector=selector))
+                else:
+                    logger.warning(f"Unknown crawler type: {crawler_type}")
+            return crawlers
+        except Exception as e:
+            logger.error(f"Failed to load crawlers from {file_path}: {e}")
+            return []
diff --git a/src/main.py b/src/main.py
index 7891fec..ee90115 100644
--- a/src/main.py
+++ b/src/main.py
@@ -7,18 +7,18 @@ import chromadb
 
 from aiogram import Bot, Dispatcher
 
-from src.crawlers.base import ICrawler
-from src.crawlers.rss_crawler import RSSCrawler
-from src.crawlers.playwright_crawler import PlaywrightCrawler
+from src.crawlers.factory import CrawlerFactory
 from src.processor.ollama_provider import OllamaProvider
 from src.storage.chroma_store import ChromaStore
 from src.notifications.telegram import TelegramNotifier
 from src.orchestrator.service import TrendScoutService
 from src.bot.bot import setup_bot
 
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+logging.basicConfig(level=logging.INFO,
+                    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
 
+
 async def background_task(orchestrator: TrendScoutService, interval: int = 3600):
     """Run the orchestrator periodically."""
     while True:
@@ -28,72 +28,60 @@ async def background_task(orchestrator: TrendScoutService, interval: int = 3600)
             logger.info("Iteration completed successfully.")
         except Exception as e:
             logger.error(f"Error during iteration: {e}", exc_info=True)
-        
+
         logger.info(f"Sleeping for {interval} seconds before next iteration.")
         await asyncio.sleep(interval)
 
+
 async def main():
     load_dotenv()
 
     # Load configuration
     bot_token = os.getenv("TELEGRAM_BOT_TOKEN")
     chat_id = os.getenv("TELEGRAM_CHAT_ID", "")
-    ollama_url = os.getenv("OLLAMA_API_URL", "http://localhost:11434/api/generate")
+    ollama_url = os.getenv(
+        "OLLAMA_API_URL", "http://localhost:11434/api/generate")
     chroma_db_path = os.getenv("CHROMA_DB_PATH", "./chroma_db")
-    
+
     if not bot_token:
         logger.error("TELEGRAM_BOT_TOKEN is missing!")
         return
-        
+
     if not chat_id or chat_id == "YOUR_CHAT_ID_HERE":
-        logger.warning("TELEGRAM_CHAT_ID is missing or not set. Notifications will fail.")
+        logger.warning(
+            "TELEGRAM_CHAT_ID is missing or not set. Notifications will fail.")
 
     # 1. Initialize Components that do not depend on Bot
-    crawlers: List[ICrawler] = [
-        RSSCrawler("https://habr.com/ru/rss/hubs/artificial_intelligence/articles/?fl=ru", source="Habr AI"),
-        RSSCrawler("https://www.nature.com/nature.rss", source="Nature"),
-        RSSCrawler("https://news.google.com/rss/search?q=WebOS+Chromium+Edge+AI+LGE+SmartTV&hl=en-US&gl=US&ceid=US:en", source="Google News R&D"),
-        RSSCrawler("https://news.samsung.com/global/rss", source="Samsung Newsroom"),
-        RSSCrawler("https://www.sony.com/en/SonyInfo/News/Service/rss.xml", source="Sony Newsroom"),
-        PlaywrightCrawler("https://cvpr.thecvf.com/Conferences/2025", source="CVPR 2025", selector=".conference-news-item"),
-        PlaywrightCrawler("https://www.ces.tech/news/press-releases.aspx", source="CES 2025", selector=".press-release-item"),
-        RSSCrawler("https://vc.ru/rss/tech", source="VC.ru Tech"),
-        RSSCrawler("https://rb.ru/rss/", source="RB.ru"),
-        RSSCrawler("https://www.science.org/rss/news_current.xml", source="Science News"),
-        RSSCrawler("https://ufn.ru/en/rss/", source="УФН"),
-        RSSCrawler("https://www.tadviser.ru/xml/tadviser.xml", source="TAdviser"),
-        RSSCrawler("https://habr.com/ru/rss/company/yandex/blog/", source="Yandex Tech"),
-        RSSCrawler("https://blog.google/technology/ai/rss/", source="Google AI Blog"),
-    ]
-    
+    crawlers = CrawlerFactory.load_from_yaml("src/crawlers.yml")
+
     processor = OllamaProvider()
-    
+
     if chroma_db_path:
         chroma_client = chromadb.PersistentClient(path=chroma_db_path)
     else:
         chroma_client = chromadb.Client()
-        
+
     storage = ChromaStore(client=chroma_client)
 
     # 2. Initialize Bot & Dispatcher
     bot, dp = setup_bot(bot_token, storage, processor, chat_id)
-    
+
     # 3. Initialize Notifier and Orchestrator
     notifier = TelegramNotifier(bot, chat_id)
-    
+
     orchestrator = TrendScoutService(
         crawlers=crawlers,
         processor=processor,
         storage=storage,
         notifier=notifier
     )
-    
+
     # 4. Start tasks
     logger.info("Starting TrendScout AI Bot and Background Task...")
-    
+
     # Create the background task
     bg_task = asyncio.create_task(background_task(orchestrator, interval=3600))
-    
+
     # Start polling the Telegram bot (blocking call)
     try:
         await dp.start_polling(bot)
diff --git a/src/processor/ollama_provider.py b/src/processor/ollama_provider.py
index 8f956dd..8797a71 100644
--- a/src/processor/ollama_provider.py
+++ b/src/processor/ollama_provider.py
@@ -6,6 +6,7 @@ from src.crawlers.dto import NewsItemDTO
 from src.processor.base import ILLMProvider
 from src.processor.dto import EnrichedNewsItemDTO
 
+
 class OllamaProvider(ILLMProvider):
     def get_info(self) -> dict[str, str]:
         base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
@@ -18,17 +19,31 @@ class OllamaProvider(ILLMProvider):
 
     async def analyze(self, news_item: NewsItemDTO) -> EnrichedNewsItemDTO:
         base_url = os.environ.get('OLLAMA_API_URL', 'http://localhost:11434')
-        url = base_url if base_url.endswith('/api/generate') else f"{base_url.rstrip('/')}/api/generate"
+        url = base_url if base_url.endswith(
+            '/api/generate') else f"{base_url.rstrip('/')}/api/generate"
         prompt = (
-            f"Analyze the following article.\nTitle: {news_item.title}\n"
-            f"Content: {news_item.content_text}\n"
-            "Return JSON with 'relevance_score' (0-10), 'summary_ru' (string), 'anomalies_detected' (list of strings), and 'category' (string).\n"
-            "The 'summary_ru' MUST be in Russian and strictly NO MORE than 2 sentences.\n"
-            "The 'category' must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n"
-            "For 'relevance_score', prioritize and give higher scores to articles related to R&D, Chromium, NPU, and Smart TV operating systems.\n"
-            "Regarding 'anomalies_detected': only detect factual, conceptual, or industry-related anomalies (e.g., sudden technological shifts, unexpected competitor moves). "
-            "DO NOT detect technical anomalies related to the text's formatting, HTML tags, metadata, or document structure. "
-            "If no real anomalies are found, return an empty list."
+            "Act as a Strategic Tech Scout for an R&D department specializing in WebEngine (Chromium) extensions, "
+            "cross-platform porting, Middleware platform solutions, and System Tools (SWE) for developers.\n\n"
+            f"Analyze the following article.\nTitle: {news_item.title}\nContent: {news_item.content_text}\n\n"
+
+            "Return a JSON object with: 'relevance_score' (integer 0-10), 'summary_ru' (string), "
+            "'anomalies_detected' (list of strings), and 'category' (string).\n\n"
+
+            "OUTPUT RULES:\n"
+            "1. 'summary_ru': MUST be in Russian and strictly NO MORE than 2 sentences. Focus on the technological or business value for an R&D team.\n"
+            "2. 'category': Must be exactly one of: 'Browsers', 'Edge AI', 'SmartTV', 'Samsung New Technologies', 'Middleware new trends', 'Competitors', 'Other'.\n\n"
+
+            "SCORING LOGIC ('relevance_score'):\n"
+            "- Score 9-10 (Core R&D): Breakthroughs in web rendering engines, new cross-platform porting frameworks, Edge AI/NPU integration at the middleware level, or disruptive software developer tools (SWE).\n"
+            "- Score 7-8 (Ecosystem): Technologies highly applicable to Automotive Content Platforms, IoT ecosystems (like LG LUPA), or major SmartTV OS updates.\n"
+            "- Score 3-6 (Peripheral): General news in Robotics, Medical Displays, or HVAC.\n"
+            "- Score 0 (Excluded): Pure Audio/Acoustic technologies, or consumer-level updates about standalone laptops.\n\n"
+
+            "ANOMALY DETECTION ('anomalies_detected'):\n"
+            "Do not just summarize. Look for strategic or architectural disruptions. Examples of valid anomalies: "
+            "a competitor abandoning a proprietary OS for Chromium, sudden new industry standards in IoT/Middleware, "
+            "or unexpected convergence of WebTech with hardware (e.g., Medical/Automotive). "
+            "Ignore technical text formatting issues. Return an empty list [] if no strategic anomalies are found."
         )
         payload = {
             "model": os.environ.get('OLLAMA_MODEL', 'gpt-oss:120b-cloud'),
@@ -36,29 +51,29 @@ class OllamaProvider(ILLMProvider):
             "stream": False,
             "format": "json"
         }
-        
+
         async with aiohttp.ClientSession() as session:
             async with session.post(url, json=payload) as response:
                 response.raise_for_status()
                 data = await response.json()
-                
+
                 # Ollama returns the generated text inside 'response' key
                 generated_text = data.get('response', '')
                 if not generated_text:
                     generated_text = "{}"
-                
+
                 # Strip markdown code blocks
                 cleaned_text = generated_text.strip()
                 if cleaned_text.startswith("```json"):
                     cleaned_text = cleaned_text[7:]
                 elif cleaned_text.startswith("```"):
                     cleaned_text = cleaned_text[3:]
-                
+
                 if cleaned_text.endswith("```"):
                     cleaned_text = cleaned_text[:-3]
-                    
+
                 cleaned_text = cleaned_text.strip()
-                
+
                 try:
                     parsed_json = json.loads(cleaned_text)
                     if not isinstance(parsed_json, dict):
@@ -70,7 +85,7 @@ class OllamaProvider(ILLMProvider):
                         "anomalies_detected": [],
                         "category": "Other"
                     }
-                
+
                 return EnrichedNewsItemDTO(
                     title=news_item.title,
                     url=news_item.url,
@@ -79,6 +94,7 @@ class OllamaProvider(ILLMProvider):
                     timestamp=news_item.timestamp,
                     relevance_score=parsed_json.get('relevance_score', 0),
                     summary_ru=parsed_json.get('summary_ru', ''),
-                    anomalies_detected=parsed_json.get('anomalies_detected', []),
+                    anomalies_detected=parsed_json.get(
+                        'anomalies_detected', []),
                     category=parsed_json.get('category', 'Other')
                 )
diff --git a/tests/crawlers/test_factory.py b/tests/crawlers/test_factory.py
new file mode 100644
index 0000000..bf91847
--- /dev/null
+++ b/tests/crawlers/test_factory.py
@@ -0,0 +1,120 @@
+import pytest
+import yaml
+from unittest.mock import patch, mock_open
+from src.crawlers.factory import CrawlerFactory
+from src.crawlers.rss_crawler import RSSCrawler
+from src.crawlers.playwright_crawler import PlaywrightCrawler
+
+VALID_YAML = """
+crawlers:
+  - type: rss
+    url: "https://example.com/rss"
+    source: "Example RSS"
+  - type: playwright
+    url: "https://example.com/playwright"
+    source: "Example Playwright"
+    selector: ".item"
+"""
+
+INVALID_TYPE_YAML = """
+crawlers:
+  - type: unknown
+    url: "https://example.com/unknown"
+    source: "Unknown"
+  - type: rss
+    url: "https://example.com/rss"
+    source: "Example RSS"
+"""
+
+MALFORMED_YAML = """
+crawlers:
+  - type: rss
+  [ missing stuff ]
+"""
+
+MISSING_KEYS_YAML = """
+crawlers:
+  - type: rss
+    # url is missing
+    source: "Missing URL"
+  - url: "https://example.com/no-type"
+    source: "Missing Type"
+"""
+
+def test_load_from_yaml_valid():
+    """A well-formed config yields one crawler per entry, in file order."""
+    with patch("builtins.open", mock_open(read_data=VALID_YAML)):
+        loaded = CrawlerFactory.load_from_yaml("dummy.yml")
+    assert len(loaded) == 2
+    rss, pw = loaded
+    assert isinstance(rss, RSSCrawler)
+    assert rss.url == "https://example.com/rss"
+    assert rss.source == "Example RSS"
+    assert isinstance(pw, PlaywrightCrawler)
+    assert pw.url == "https://example.com/playwright"
+    assert pw.source == "Example Playwright"
+    assert pw.selector == ".item"
+
+def test_load_from_yaml_unknown_type():
+    """Entries of unknown type are dropped with a warning; valid ones stay."""
+    with patch("builtins.open", mock_open(read_data=INVALID_TYPE_YAML)), \
+            patch("src.crawlers.factory.logger") as log:
+        loaded = CrawlerFactory.load_from_yaml("dummy.yml")
+    assert len(loaded) == 1
+    assert isinstance(loaded[0], RSSCrawler)
+    log.warning.assert_called_with("Unknown crawler type: unknown")
+
+def test_load_from_yaml_malformed():
+    """Unparseable YAML is reported via logger.error and yields no crawlers."""
+    with patch("builtins.open", mock_open(read_data=MALFORMED_YAML)), \
+            patch("src.crawlers.factory.logger") as log:
+        loaded = CrawlerFactory.load_from_yaml("dummy.yml")
+    # The parser's exception is expected to be caught by the factory and logged.
+    assert loaded == []
+    log.error.assert_called()
+
+def test_load_from_yaml_missing_keys():
+    """Entries lacking url or type produce warnings and no crawlers."""
+    with patch("builtins.open", mock_open(read_data=MISSING_KEYS_YAML)), \
+            patch("src.crawlers.factory.logger") as log:
+        loaded = CrawlerFactory.load_from_yaml("dummy.yml")
+    # Entry 1 has no url -> rejected by the mandatory-field check.
+    # Entry 2 has no type -> falls through to the unknown-type branch.
+    assert len(loaded) == 0
+    # Collect the first positional argument of every warning() call.
+    recorded = log.warning.call_args_list
+    messages = [entry.args[0] for entry in recorded]
+    assert any("Missing mandatory fields" in msg for msg in messages)
+    assert any("Unknown crawler type: None" in msg for msg in messages)
+
+def test_load_from_yaml_file_not_found():
+    """A missing file (open() deliberately unpatched) logs an error, gives []."""
+    with patch("src.crawlers.factory.logger") as log:
+        loaded = CrawlerFactory.load_from_yaml("non_existent_file_12345.yml")
+    assert loaded == []
+    log.error.assert_called()
+
+def test_load_from_yaml_empty_file():
+    """An empty file is reported with a warning and yields no crawlers."""
+    with patch("builtins.open", mock_open(read_data="")), patch("src.crawlers.factory.logger") as log:
+        loaded = CrawlerFactory.load_from_yaml("empty.yml")
+    assert loaded == []
+    log.warning.assert_called_with("Invalid or empty configuration in empty.yml")
+
+def test_integration_load_actual_config():
+    """The real src/crawlers.yml loads without warnings into valid crawlers."""
+    # Resolve relative to this file so the test does not depend on the CWD.
+    from pathlib import Path
+    config_path = Path(__file__).resolve().parents[2] / "src" / "crawlers.yml"
+    with patch("src.crawlers.factory.logger") as mock_logger:
+        crawlers = CrawlerFactory.load_from_yaml(str(config_path))
+        assert len(crawlers) > 0
+        mock_logger.warning.assert_not_called()
+        mock_logger.error.assert_not_called()
+        for crawler in crawlers:
+            assert isinstance(crawler, (RSSCrawler, PlaywrightCrawler))
+            assert crawler.url.startswith("http")
+            assert crawler.source
+            if isinstance(crawler, PlaywrightCrawler):
+                # Every playwright entry in src/crawlers.yml defines a selector.
+                assert crawler.selector