Update for v0.2.2
- Support multiple JS scripts
- Fixed several bugs
- Resolved a few issues related to Colab installation
unclecode committed Jun 2, 2024
1 parent f1b60b2 commit 51f26d1
Showing 8 changed files with 79 additions and 32 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -10,8 +10,14 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information

 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
 
-## Recent Changes v0.2.0
+## Recent Changes
+
+### v0.2.2
+- Support multiple JS scripts
+- Fixed several bugs
+- Resolved a few issues related to Colab installation
+
+### v0.2.0
 - 🚀 10x faster!!
 - 📜 Execute custom JavaScript before crawling!
 - 🤝 Colab friendly!
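
The headline change is that `js_code` can now be either a single string or a list of scripts, each executed in order before the page HTML is captured. A minimal usage sketch, adapted from the `multiple_scrip` quickstart example added later in this commit (the import paths are an assumption, since the quickstart's import block is not shown in this diff):

```python
from crawl4ai import WebCrawler
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

# Queue the same click handler twice: each script runs in order, and the
# crawler waits for the page to settle before capturing the HTML.
js_code = ["""
const loadMoreButton = Array.from(document.querySelectorAll('button'))
    .find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""] * 2

crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
result = crawler.run(url="https://www.nbcnews.com/business")
print(result)
```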
8 changes: 7 additions & 1 deletion crawl4ai/crawler_strategy.py
@@ -103,12 +103,18 @@ def crawl(self, url: str) -> str:
         )
 
         # Execute JS code if provided
-        if self.js_code:
+        if self.js_code and type(self.js_code) == str:
             self.driver.execute_script(self.js_code)
             # Optionally, wait for some condition after executing the JS code
             WebDriverWait(self.driver, 10).until(
                 lambda driver: driver.execute_script("return document.readyState") == "complete"
             )
+        elif self.js_code and type(self.js_code) == list:
+            for js in self.js_code:
+                self.driver.execute_script(js)
+                WebDriverWait(self.driver, 10).until(
+                    lambda driver: driver.execute_script("return document.readyState") == "complete"
+                )
 
         html = self.driver.page_source
17 changes: 9 additions & 8 deletions crawl4ai/extraction_strategy.py
@@ -188,14 +188,15 @@ def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2
         if self.verbose:
             print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
 
-        if False and self.device.type == "cpu":
-            self.model = load_onnx_all_MiniLM_l6_v2()
-            self.tokenizer = self.model.tokenizer
-            self.get_embedding_method = "direct"
-        else:
-            self.tokenizer, self.model = load_bge_small_en_v1_5()
-            self.model.eval()
-            self.get_embedding_method = "batch"
+        # if False and self.device.type == "cpu":
+        #     self.model = load_onnx_all_MiniLM_l6_v2()
+        #     self.tokenizer = self.model.tokenizer
+        #     self.get_embedding_method = "direct"
+        # else:
+
+        self.tokenizer, self.model = load_bge_small_en_v1_5()
+        self.model.eval()
+        self.get_embedding_method = "batch"
 
         self.buffer_embeddings = np.array([])
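With the ONNX branch commented out, both CPU and GPU devices now go through `load_bge_small_en_v1_5()`. Its body is not part of this diff; a plausible sketch of such a loader, assuming it wraps the Hugging Face `BAAI/bge-small-en-v1.5` checkpoint via `transformers` (the real implementation in `crawl4ai/model_loader.py` may differ):

```python
from functools import lru_cache

from transformers import AutoModel, AutoTokenizer


@lru_cache()
def load_bge_small_en_v1_5():
    # Checkpoint name is an assumption based on the function name; caching
    # ensures repeated calls reuse the same tokenizer and weights.
    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
    model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5")
    return tokenizer, model
```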
7 changes: 3 additions & 4 deletions crawl4ai/model_loader.py
@@ -35,8 +35,7 @@ def calculate_batch_size(device):
         else:
             return 32
     else:
-        return 16  # Default batch size
-
+        return 16  # Default batch size
 
 @lru_cache()
 def get_device():
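
`get_device()` itself is unchanged and its body is not shown in this diff; a typical implementation of such a cached device probe, sketched here as an assumption:

```python
from functools import lru_cache

import torch


@lru_cache()
def get_device():
    # Prefer CUDA, then Apple Silicon's MPS, then CPU; @lru_cache means
    # the probe runs once per process, matching the decorator above.
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")
```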
@@ -258,8 +257,8 @@ def download_all_models(remove_existing=False):
     # load_bert_base_uncased()
     # print("[LOG] Downloading BGE Small EN v1.5...")
     # load_bge_small_en_v1_5()
-    print("[LOG] Downloading ONNX model...")
-    load_onnx_all_MiniLM_l6_v2()
+    # print("[LOG] Downloading ONNX model...")
+    # load_onnx_all_MiniLM_l6_v2()
     print("[LOG] Downloading text classifier...")
     _, device = load_text_multilabel_classifier()
     print(f"[LOG] Text classifier loaded on {device}")
17 changes: 17 additions & 0 deletions docs/examples/quickstart.py
@@ -164,6 +164,22 @@ def interactive_extraction(crawler):
     cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
     print_result(result)
 
+def multiple_scrip(crawler):
+    # Passing JavaScript code to interact with the page
+    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
+    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+    js_code = ["""
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """] * 2
+    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    result = crawler.run(
+        url="https://www.nbcnews.com/business",
+    )
+    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
+    print_result(result)
+
 def main():
     cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
     cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
@@ -180,6 +196,7 @@ def main():
     add_llm_extraction_strategy(crawler)
     targeted_extraction(crawler)
     interactive_extraction(crawler)
+    multiple_scrip(crawler)
 
     cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")

13 changes: 13 additions & 0 deletions requirements.crawl.txt
@@ -0,0 +1,13 @@
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+pydantic
+python-dotenv
+requests
+rich
+selenium
+uvicorn
+chromedriver-autoinstaller
32 changes: 16 additions & 16 deletions requirements.txt
@@ -1,20 +1,20 @@
-aiohttp==3.9.5
-aiosqlite==0.20.0
-bs4==0.0.2
-fastapi==0.111.0
+aiohttp
+aiosqlite
+bs4
+fastapi
 html2text
-httpx==0.27.0
-litellm==1.37.11
-nltk==3.8.1
-pydantic==2.7.1
-python-dotenv==1.0.1
-requests==2.31.0
-rich==13.7.1
-scikit-learn==1.4.2
-selenium==4.20.0
-uvicorn==0.29.0
-transformers==4.40.2
-chromedriver-autoinstaller==0.6.4
+httpx
+litellm
+nltk
+pydantic
+python-dotenv
+requests
+rich
+scikit-learn
+selenium
+uvicorn
+transformers
+chromedriver-autoinstaller
 torch
 onnxruntime
 tokenizers
9 changes: 7 additions & 2 deletions setup.py
@@ -7,11 +7,16 @@
 with open("requirements.txt") as f:
     requirements = f.read().splitlines()
 
+# Read the crawl-only requirements from requirements.crawl.txt
+with open("requirements.crawl.txt") as f:
+    requirements_crawl_only = f.read().splitlines()
+
 # Define the requirements for different environments
 requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
 requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
 requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
 requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
+requirements_crawl_only = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
 
 class CustomInstallCommand(install):
     """Customized setuptools install command to install spacy without dependencies."""
@@ -21,7 +26,7 @@ def run(self):

 setup(
     name="Crawl4AI",
-    version="0.2.1",
+    version="0.2.2",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
@@ -34,7 +39,7 @@ def run(self):
     extras_require={
         "all": requirements,  # Include all requirements
         "colab": requirements_without_torch,  # Exclude torch for Colab
-        "crawl": requirements_without_torch_transformers_nlkt
+        "crawl": requirements_crawl_only,  # Include only crawl requirements
     },
     cmdclass={
         'install': CustomInstallCommand,
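Given this `extras_require` mapping, an install such as `pip install "Crawl4AI[crawl]"` should pull in only the scraping dependencies (no torch, transformers, or nltk), while `[colab]` drops torch only and `[all]` keeps the full ML stack.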
