Update for v0.2.2
- Support multiple JS scripts
- Fixed several bugs
- Resolved a few issues related to Colab installation
unclecode committed Jun 2, 2024
1 parent f1b60b2 commit 51f26d1
Showing 8 changed files with 79 additions and 32 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -10,8 +10,14 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information

 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
 
-## Recent Changes v0.2.0
+## Recent Changes
+
+### v0.2.2
+- Support multiple JS scripts
+- Fixed several bugs
+- Resolved a few issues related to Colab installation
+
+### v0.2.0
 - 🚀 10x faster!!
 - 📜 Execute custom JavaScript before crawling!
 - 🤝 Colab friendly!
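
The headline change is that `js_code` can now be either a single string or a list of scripts, each executed in order before the page HTML is captured. A minimal usage sketch, adapted from the `multiple_scrip` quickstart example added later in this commit (the import paths are an assumption, since the quickstart's import block is not shown in this diff):

```python
from crawl4ai import WebCrawler
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

# Queue the same click handler twice: each script runs in order, and the
# crawler waits for the page to settle before capturing the HTML.
js_code = ["""
const loadMoreButton = Array.from(document.querySelectorAll('button'))
    .find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""] * 2

crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run(url="https://www.nbcnews.com/business")
print(result)
```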
8 changes: 7 additions & 1 deletion crawl4ai/crawler_strategy.py
@@ -103,12 +103,18 @@ def crawl(self, url: str) -> str:
         )
 
         # Execute JS code if provided
-        if self.js_code:
+        if self.js_code and type(self.js_code) == str:
             self.driver.execute_script(self.js_code)
             # Optionally, wait for some condition after executing the JS code
             WebDriverWait(self.driver, 10).until(
                 lambda driver: driver.execute_script("return document.readyState") == "complete"
             )
+        elif self.js_code and type(self.js_code) == list:
+            for js in self.js_code:
+                self.driver.execute_script(js)
+                WebDriverWait(self.driver, 10).until(
+                    lambda driver: driver.execute_script("return document.readyState") == "complete"
+                )
 
         html = self.driver.page_source
17 changes: 9 additions & 8 deletions crawl4ai/extraction_strategy.py
@@ -188,14 +188,15 @@ def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2
         if self.verbose:
             print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
 
-        if False and self.device.type == "cpu":
-            self.model = load_onnx_all_MiniLM_l6_v2()
-            self.tokenizer = self.model.tokenizer
-            self.get_embedding_method = "direct"
-        else:
-            self.tokenizer, self.model = load_bge_small_en_v1_5()
-            self.model.eval()
-            self.get_embedding_method = "batch"
+        # if False and self.device.type == "cpu":
+        #     self.model = load_onnx_all_MiniLM_l6_v2()
+        #     self.tokenizer = self.model.tokenizer
+        #     self.get_embedding_method = "direct"
+        # else:
+
+        self.tokenizer, self.model = load_bge_small_en_v1_5()
+        self.model.eval()
+        self.get_embedding_method = "batch"
 
         self.buffer_embeddings = np.array([])
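With the ONNX branch commented out, both CPU and GPU devices now go through `load_bge_small_en_v1_5()`. Its body is not part of this diff; a plausible sketch of such a loader, assuming it wraps the Hugging Face `BAAI/bge-small-en-v1.5` checkpoint via `transformers` (the real implementation in `crawl4ai/model_loader.py` may differ):

```python
from functools import lru_cache

from transformers import AutoModel, AutoTokenizer


@lru_cache()
def load_bge_small_en_v1_5():
    # Checkpoint name is an assumption based on the function name; caching
    # ensures repeated calls reuse the same tokenizer and weights.
    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
    model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5")
    return tokenizer, model
```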
7 changes: 3 additions & 4 deletions crawl4ai/model_loader.py
@@ -35,8 +35,7 @@ def calculate_batch_size(device):
         else:
             return 32
     else:
-        return 16  # Default batch size
-
+        return 16  # Default batch size
 
 @lru_cache()
 def get_device():
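
`get_device()` itself is unchanged and its body is not shown in this diff; a typical implementation of such a cached device probe, sketched here as an assumption:

```python
from functools import lru_cache

import torch


@lru_cache()
def get_device():
    # Prefer CUDA, then Apple Silicon's MPS, then CPU; @lru_cache means
    # the probe runs once per process, matching the decorator above.
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")
```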
@@ -258,8 +257,8 @@ def download_all_models(remove_existing=False):
     # load_bert_base_uncased()
     # print("[LOG] Downloading BGE Small EN v1.5...")
     # load_bge_small_en_v1_5()
-    print("[LOG] Downloading ONNX model...")
-    load_onnx_all_MiniLM_l6_v2()
+    # print("[LOG] Downloading ONNX model...")
+    # load_onnx_all_MiniLM_l6_v2()
     print("[LOG] Downloading text classifier...")
     _, device = load_text_multilabel_classifier()
     print(f"[LOG] Text classifier loaded on {device}")
17 changes: 17 additions & 0 deletions docs/examples/quickstart.py
@@ -164,6 +164,22 @@ def interactive_extraction(crawler):
     cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
     print_result(result)
 
+def multiple_scrip(crawler):
+    # Passing JavaScript code to interact with the page
+    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
+    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+    js_code = ["""
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """] * 2
+    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    result = crawler.run(
+        url="https://www.nbcnews.com/business",
+    )
+    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
+    print_result(result)
+
 def main():
     cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
     cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
@@ -180,6 +196,7 @@ def main():
     add_llm_extraction_strategy(crawler)
     targeted_extraction(crawler)
     interactive_extraction(crawler)
+    multiple_scrip(crawler)
 
     cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")

13 changes: 13 additions & 0 deletions requirements.crawl.txt
@@ -0,0 +1,13 @@
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+pydantic
+python-dotenv
+requests
+rich
+selenium
+uvicorn
+chromedriver-autoinstaller
32 changes: 16 additions & 16 deletions requirements.txt
@@ -1,20 +1,20 @@
-aiohttp==3.9.5
-aiosqlite==0.20.0
-bs4==0.0.2
-fastapi==0.111.0
+aiohttp
+aiosqlite
+bs4
+fastapi
 html2text
-httpx==0.27.0
-litellm==1.37.11
-nltk==3.8.1
-pydantic==2.7.1
-python-dotenv==1.0.1
-requests==2.31.0
-rich==13.7.1
-scikit-learn==1.4.2
-selenium==4.20.0
-uvicorn==0.29.0
-transformers==4.40.2
-chromedriver-autoinstaller==0.6.4
+httpx
+litellm
+nltk
+pydantic
+python-dotenv
+requests
+rich
+scikit-learn
+selenium
+uvicorn
+transformers
+chromedriver-autoinstaller
 torch
 onnxruntime
 tokenizers
9 changes: 7 additions & 2 deletions setup.py
@@ -7,11 +7,16 @@
 with open("requirements.txt") as f:
     requirements = f.read().splitlines()
 
+# Read the crawl-only requirements from requirements.crawl.txt
+with open("requirements.crawl.txt") as f:
+    requirements_crawl_only = f.read().splitlines()
+
 # Define the requirements for different environments
 requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
 requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
 requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
 requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
+requirements_crawl_only = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
 
 class CustomInstallCommand(install):
     """Customized setuptools install command to install spacy without dependencies."""
@@ -21,7 +26,7 @@ def run(self):

 setup(
     name="Crawl4AI",
-    version="0.2.1",
+    version="0.2.2",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
@@ -34,7 +39,7 @@ def run(self):
     extras_require={
         "all": requirements,  # Include all requirements
         "colab": requirements_without_torch,  # Exclude torch for Colab
-        "crawl": requirements_without_torch_transformers_nlkt
+        "crawl": requirements_crawl_only,  # Include only crawl requirements
     },
     cmdclass={
         'install': CustomInstallCommand,
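Given this `extras_require` mapping, an install such as `pip install "Crawl4AI[crawl]"` should pull in only the scraping dependencies (no torch, transformers, or nltk), while `[colab]` drops torch only and `[all]` keeps the full ML stack.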
