Merge pull request #255 from maheshpec/feature/configure-cache-directory
feat(config): Adding a configurable way of setting the cache directory for constrained environments
unclecode authored Nov 13, 2024
2 parents 8c22396 + 00026b5 commit 38044d4
Showing 10 changed files with 14 additions and 14 deletions.
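The same pattern recurs in every changed file: wherever the `.crawl4ai` folder was previously anchored at `Path.home()`, the code now consults the `CRAWL4_AI_BASE_DIRECTORY` environment variable first and falls back to the home directory. A minimal sketch of that resolution logic, standing alone (the resulting folder layout mirrors what the library creates; nothing here is crawl4ai-specific API):

import os
from pathlib import Path

# Prefer CRAWL4_AI_BASE_DIRECTORY when it is set; otherwise keep the old
# behaviour of rooting everything under the user's home directory.
base_directory = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())
crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
cache_folder = os.path.join(crawl4ai_folder, "cache")

os.makedirs(cache_folder, exist_ok=True)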
4 changes: 2 additions & 2 deletions crawl4ai/async_crawler_strategy.py
@@ -525,7 +525,7 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
 
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             if os.path.exists(cache_file_path):
                 html = ""
@@ -725,7 +725,7 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
 
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             with open(cache_file_path, "w", encoding="utf-8") as f:
                 f.write(html)
2 changes: 1 addition & 1 deletion crawl4ai/async_database.py
@@ -10,7 +10,7 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
 
4 changes: 2 additions & 2 deletions crawl4ai/async_webcrawler.py
@@ -23,14 +23,14 @@ def __init__(
         self,
         crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
         always_by_pass_cache: bool = False,
-        base_directory: str = str(Path.home()),
+        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
         **kwargs,
     ):
         self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
             **kwargs
         )
         self.always_by_pass_cache = always_by_pass_cache
-        # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
6 changes: 3 additions & 3 deletions crawl4ai/crawler_strategy.py
@@ -132,7 +132,7 @@ def __init__(self, use_cached_html=False, js_code=None, **kwargs):
 
         # chromedriver_autoinstaller.install()
         # import chromedriver_autoinstaller
-        # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
         # chromedriver_path = chromedriver_autoinstaller.install()
         # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
@@ -205,7 +205,7 @@ def crawl(self, url: str, **kwargs) -> str:
         url_hash = hashlib.md5(url.encode()).hexdigest()
 
         if self.use_cached_html:
-            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+            cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
             if os.path.exists(cache_file_path):
                 with open(cache_file_path, "r") as f:
                     return sanitize_input_encode(f.read())
@@ -275,7 +275,7 @@ def crawl(self, url: str, **kwargs) -> str:
         self.driver = self.execute_hook('before_return_html', self.driver, html)
 
         # Store in cache
-        cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+        cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
         with open(cache_file_path, "w", encoding="utf-8") as f:
             f.write(html)
 
2 changes: 1 addition & 1 deletion crawl4ai/database.py
@@ -3,7 +3,7 @@
 import sqlite3
 from typing import Optional, Tuple
 
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
 
2 changes: 1 addition & 1 deletion crawl4ai/model_loader.py
@@ -56,7 +56,7 @@ def set_model_device(model):
 
 @lru_cache()
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)
2 changes: 1 addition & 1 deletion crawl4ai/utils.py
@@ -60,7 +60,7 @@ class MEMORYSTATUSEX(ctypes.Structure):
         raise OSError("Unsupported operating system")
 
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)
2 changes: 1 addition & 1 deletion crawl4ai/web_crawler.py
@@ -20,7 +20,7 @@ class WebCrawler:
     def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
         self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
         self.always_by_pass_cache = always_by_pass_cache
-        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
         init_db()
2 changes: 1 addition & 1 deletion docs/md_v2/api/async-webcrawler.md
@@ -13,7 +13,7 @@ AsyncWebCrawler(
 
     # Cache Settings
    always_by_pass_cache: bool = False,  # Always bypass cache
-    base_directory: str = str(Path.home()),  # Base directory for cache
+    base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),  # Base directory for cache
 
     # Network Settings
     proxy: str = None,  # Simple proxy URL
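Callers that prefer not to rely on the environment variable can still pass the documented base_directory parameter explicitly. A hedged usage sketch, assuming the library's usual async-context-manager and arun entry points; the directory and URL are placeholders:

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # Keep the .crawl4ai cache and database under a writable location
    # instead of the user's home directory.
    async with AsyncWebCrawler(base_directory="/data/crawl4ai-home") as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown[:200])

asyncio.run(main())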
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@
 
 # Create the .crawl4ai folder in the user's home directory if it doesn't exist
 # If the folder already exists, remove the cache folder
-crawl4ai_folder = Path.home() / ".crawl4ai"
+crawl4ai_folder = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai"
 cache_folder = crawl4ai_folder / "cache"
 
 if cache_folder.exists():
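Note that DB_PATH in database.py and async_database.py is resolved at module import time, so for the override to take effect the variable has to be present before crawl4ai is imported. A minimal sketch of doing that from Python; "/var/lib/crawl4ai" is only an illustrative location:

import os

# Set the base directory before any crawl4ai module is imported, since the
# database path is computed from this variable at import time.
os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/var/lib/crawl4ai"

import crawl4ai  # imported after the environment is prepared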
